bibliography-sorted.bib

@inproceedings{ports19:_when_shoul_networ_be_comput,
  address = {Bertinoro, Italy},
  author = {Dan R. K. Ports and Jacob Nelson},
  booktitle = {Proceedings of the 17th Workshop on Hot Topics in
		  Operating Systems ({HotOS} '19)},
  month = may,
  organization = {{ACM}},
  title = {When Should The Network Be The Computer?},
  year = {2019},
  abstract = {Researchers have repurposed programmable network devices
		  to place small amounts of application computation in the
		  network, sometimes yielding orders-of-magnitude performance
		  gains. At the same time, effectively using these devices
		  requires careful use of limited resources and managing
		  deployment challenges. 

This paper provides a framework for principled use of in-network processing. We provide a set of guidelines for building robust and deployable in-network primitives, along with a taxonomy to help identify which applications can benefit from in-network processing and what types of devices they should use.},
  pdf = {papers/innetwork-hotos19.pdf},
  monthnum = {05}
}

@inproceedings{michael18:_towar_causal_datac_networ,
  address = {Porto, Portugal},
  author = {Ellis Michael and Dan R. K. Ports},
  booktitle = {Proceedings of the 2018 Workshop on Principles and
		  Practice of Consistency for Distributed Data ({PaPoC}
		  '18)},
  month = apr,
  organization = {{ACM}},
  title = {Towards Causal Datacenter Networks},
  year = {2018},
  abstract = {Traditionally, distributed systems conservatively assume
		  an asynchronous network. However, recent work on the
		  co-design of networks and distributed systems has shown
		  that stronger ordering properties are achievable in
		  datacenter networks and yield performance improvements for
		  the distributed systems they support. We build on that
		  trend and ask whether it is possible for the datacenter
		  network to order all messages in a protocol-agnostic way.
		  This approach, which we call omnisequencing, would ensure
		  causal delivery of all messages, making consistency a
		  network-level guarantee.},
  pdf = {papers/causal-papoc18.pdf},
  monthnum = {04}
}
@inproceedings{gudmundsdottir17:_demon_inter_analy_perfor_measur_viska,
  address = {Chicago, IL, USA},
  author = {Helga Gudmundsdottir and Babak Salimi and Magdalena
		  Balazinska and Dan R. K. Ports and Dan Suciu},
  booktitle = {Proceedings of the 2017 {ACM} {SIGMOD} {I}nternational
		  {C}onference on {M}anagement of {D}ata},
  month = may,
  note = {Demonstration},
  organization = {{ACM}},
  title = {A Demonstration of Interactive Analysis of Performance
		  Measurements with {Viska}},
  year = {2017},
  abstract = {The ultimate goal of system performance analysis is to
		  identify the underlying causes for performance
		  differences between different systems and different
		  workloads. We make it easier to achieve this goal with
		  Viska, a new tool for generating and interpreting
		  performance measurement results. Viska leverages
		  cutting-edge techniques from big data analytics and data
		  visualization to aid and automate this analysis, and helps
		  users derive meaningful and statistically sound conclusions
		  using state-of-the-art causal inference and hypothesis
		  testing techniques.},
  pdf = {papers/viska-sigmod17demo.pdf},
  monthnum = {05}
}
@inproceedings{li17:_eris,
  address = {Shanghai, China},
  author = {Jialin Li and Ellis Michael and Dan R. K. Ports},
  booktitle = {Proceedings of the 26th {ACM} {S}ymposium on {O}perating
		  {S}ystems {P}rinciples ({SOSP} '17)},
  month = oct,
  organization = {{ACM}},
  title = {{Eris}: Coordination-Free Consistent Transactions using
		  Network Multi-Sequencing},
  year = {2017},
  abstract = {Distributed storage systems aim to provide strong
		  consistency and isolation guarantees on an architecture
		  that is partitioned across multiple shards for scalability
		  and replicated for fault-tolerance. Traditionally,
		  achieving all of these goals has required an expensive
		  combination of atomic commitment and replication protocols
		  -- introducing extensive coordination overhead. Our system,
		  Eris, takes a very different approach. It moves a core
		  piece of concurrency control functionality, which we term
		  multi-sequencing, into the datacenter network itself. This
		  network primitive takes on the responsibility for
		  consistently ordering transactions, and a new lightweight
		  transaction protocol ensures atomicity. The end result is
		  that Eris avoids both replication and transaction
		  coordination overhead: we show that it can process a large
		  class of distributed transactions in a single round-trip
		  from the client to the storage system without any explicit
		  coordination between shards or replicas. It provides
		  atomicity, consistency, and fault-tolerance with less than
		  10\% overhead -- achieving throughput 4.5--35x higher and
		  latency 72--80\% lower than a conventional design on
		  standard benchmarks.},
  pdf = {papers/eris-sosp17.pdf},
  monthnum = {10}
}
@inproceedings{michael17:_recov_shared_objec_without_stabl_storag,
  address = {Vienna, Austria},
  author = {Ellis Michael and Dan R. K. Ports and Naveen Kr. Sharma
		  and Adriana Szekeres},
  booktitle = {Proceedings of the 31st International Symposium on
		  Distributed Computing ({DISC} '17)},
  key = {DISC '17},
  month = oct,
  title = {Recovering Shared Objects Without Stable Storage},
  year = {2017},
  abstract = {This paper considers the problem of building
		  fault-tolerant shared objects when processes can crash and
		  recover but lose their persistent state on recovery. This
		  Diskless Crash-Recovery (DCR) model matches the way many
		  long-lived systems are built. We show that it presents new
		  challenges, as operations that are recorded at a quorum may
		  not persist after some of the processes in that quorum
		  crash and then recover. 

To address this problem, we introduce the notion of crash-consistent quorums, where no recoveries happen during the quorum responses. We show that relying on crash-consistent quorums enables a recovery procedure that can recover all operations that successfully finished. Crash-consistent quorums can be easily identified using a mechanism we term the crash vector, which tracks the causal relationship between crashes, recoveries, and other operations.

We apply crash-consistent quorums and crash vectors to build two storage primitives. We give a new algorithm for multi-reader multi-writer atomic registers in the DCR model that guarantees safety under all conditions and termination under a natural condition. It improves on the best prior protocol for this problem by requiring fewer rounds, fewer nodes to participate in the quorum, and a less restrictive liveness condition. We also present a more efficient single-reader, single-writer atomic set---a virtual stable storage abstraction. It can be used to lift any existing algorithm from the traditional Crash-Recovery with Stable Storage model to the DCR model. We examine a specific application, state machine replication, and show that existing diskless protocols can violate their correctness guarantees, while ours offers a general and correct solution.},
  pdf = {papers/recovery-disc17.pdf},
  monthnum = {10}
}

@inproceedings{salimi17:_zaliq,
  author = {Babak Salimi and Corey Cole and Dan R. K. Ports and Dan
		  Suciu},
  booktitle = {Proceedings of the 43rd {I}nternational {C}onference on
		  {V}ery {L}arge {D}ata {B}ases ({VLDB} '17)},
  key = {VLDB '17},
  month = aug,
  note = {Demonstration},
  title = {{ZaliQL}: Causal Inference from Observational Data at
		  Scale},
  year = {2017},
  abstract = {Causal inference from observational data is a subject of
		  active research and development in statistics and computer
		  science. Many statistical software packages have been
		  developed for this purpose. However, these toolkits do not
		  scale to large datasets. We propose and demonstrate ZaliQL:
		  a SQL-based framework for drawing causal inference from
		  observational data. ZaliQL supports the state-of-the-art
		  methods for causal inference and runs at scale within
		  the PostgreSQL database system. In addition, we built a visual
		  interface to wrap around ZaliQL. In our demonstration, we
		  will use this GUI to show a live investigation of the
		  causal effect of different weather conditions on flight
		  delays.},
  pdf = {papers/zaliql-vldb17demo.pdf},
  monthnum = {08}
}
@inproceedings{gudmondsdottir16:_viska,
  address = {Savannah, GA, USA},
  author = {Helga Gudmundsdottir and Babak Salimi and Magdalena
		  Balazinska and Dan R. K. Ports and Dan Suciu},
  booktitle = {Proceedings of the 12th {USENIX} {S}ymposium on
		  {O}perating {S}ystems {D}esign and {I}mplementation ({OSDI}
		  '16)},
  month = nov,
  note = {Poster},
  organization = {{USENIX}},
  title = {Viska: Enabling Interactive Analysis of Performance
		  Measurements},
  year = {2016},
  abstract = {Much of systems research consists of performance analysis
		  -- to learn when one system outperforms another, to
		  identify architectural choices responsible for the
		  difference, or to identify performance anomalies in
		  particular workloads, for example. However, despite recent
		  advances in data analytics and interactive data
		  visualization, the tools we use for performance analysis
		  remain remarkably primitive. 

The Viska project aims to close this gap by providing a new toolkit for systems researchers to generate and interpret performance measurement results, helping users derive meaningful and statistically sound conclusions. Viska leverages cutting-edge techniques from big data analytics and data visualization to aid and automate this analysis.},
  monthnum = {11}
}

@inproceedings{holt16:_discip_incon_consis_types,
  address = {Santa Clara, CA, USA},
  author = {Brandon Holt and James Bornholt and Irene Zhang and Dan R.
		  K. Ports and Mark Oskin and Luis Ceze},
  booktitle = {Proceedings of the 7th Symposium on Cloud Computing
		  ({SOCC} '16)},
  month = oct,
  organization = {{ACM}},
  title = {Disciplined Inconsistency with Consistency Types},
  year = {2016},
  abstract = {Distributed applications and web services, such as online
		  stores or social networks, are expected to be scalable,
		  available, responsive, and fault-tolerant. To meet these
		  steep requirements in the face of high round-trip
		  latencies, network partitions, server failures, and load
		  spikes, applications use eventually consistent datastores
		  that allow them to weaken the consistency of some data.
		  However, making this transition is highly error-prone
		  because relaxed consistency models are notoriously
		  difficult to understand and test. 

In this work, we propose a new programming model for distributed data that makes consistency properties explicit and uses a type system to enforce consistency safety. With the Inconsistent, Performance-bound, Approximate (IPA) storage system, programmers specify performance targets and correctness requirements as constraints on persistent data structures and handle uncertainty about the result of datastore reads using new consistency types. We implement a prototype of this model in Scala on top of an existing datastore, Cassandra, and use it to make performance/correctness tradeoffs in two applications: a ticket sales service and a Twitter clone. Our evaluation shows that IPA prevents consistency-based programming errors and adapts consistency automatically in response to changing network conditions, performing comparably to weak consistency and 2--10x faster than strong consistency.},
  pdf = {papers/ipa-socc16.pdf},
  monthnum = {10}
}

@inproceedings{li16:_fast_replic_nopax,
  address = {Savannah, GA, USA},
  author = {Jialin Li and Ellis Michael and Adriana Szekeres and
		  Naveen Kr. Sharma and Dan R. K. Ports},
  booktitle = {Proceedings of the 12th {USENIX} {S}ymposium on
		  {O}perating {S}ystems {D}esign and {I}mplementation ({OSDI}
		  '16)},
  month = nov,
  organization = {{USENIX}},
  title = {Just Say {NO} to {Paxos} Overhead: Replacing Consensus
		  with Network Ordering},
  year = {2016},
  abstract = {Distributed applications use replication, implemented by
		  protocols like Paxos, to ensure data availability and
		  transparently mask server failures. This paper presents a
		  new approach to achieving replication in the data center
		  without the performance cost of traditional methods. Our
		  work carefully divides replication responsibility between
		  the network and protocol layers. The network orders
		  requests but does not ensure reliable delivery -- using a
		  new primitive we call ordered unreliable multicast (OUM).
		  Implementing this primitive can be achieved with
		  near-zero-cost in the data center. Our new replication
		  protocol, Network-Ordered Paxos (NOPaxos), exploits network
		  ordering to provide strongly consistent replication without
		  coordination. The resulting system not only outperforms
		  both latency- and throughput-optimized protocols on their
		  respective metrics, but also yields throughput within 2\%
		  and latency within 16 us of an unreplicated system --
		  providing replication without the performance cost.},
  pdf = {papers/nopaxos-osdi16.pdf},
  code = {https://github.com/uwsyslab/nopaxos/},
  monthnum = {11}
}
@inproceedings{holt15:_claret,
  address = {Bordeaux, France},
  author = {Brandon Holt and Irene Zhang and Dan R. K. Ports and Mark
		  Oskin and Luis Ceze},
  booktitle = {Proceedings of the 2015 Workshop on Principles and
		  Practice of Consistency for Distributed Data ({PaPoC}
		  '15)},
  month = apr,
  organization = {{ACM}},
  title = {Claret: Using Data Types for Highly Concurrent Distributed
		  Transactions},
  year = {2015},
  abstract = {Out of the many NoSQL databases in use today, some that
		  provide simple data structures for records, such as Redis
		  and MongoDB, are now becoming popular. Building
		  applications out of these complex data types provides a way
		  to communicate intent to the database system without
		  sacrificing flexibility or committing to a fixed schema.
		  Currently this capability is leveraged in limited ways,
		  such as to ensure related values are co-located, or for
		  atomic updates. There are many ways data types can be used
		  to make databases more efficient that are not yet being
		  exploited. 

We explore several ways of leveraging abstract data type (ADT) semantics in databases, focusing primarily on commutativity. Using a Twitter clone as a case study, we show that using commutativity can reduce transaction abort rates for high-contention, update-heavy workloads that arise in real social networks. We conclude that ADTs are a good abstraction for database records, providing a safe and expressive programming model with ample opportunities for optimization, making databases more safe and scalable.},
  pdf = {papers/claret-papoc15.pdf},
  monthnum = {04}
}

@inproceedings{holt15:_claret_poster,
  address = {Bordeaux, France},
  author = {Brandon Holt and Irene Zhang and Dan R. K. Ports and Mark
		  Oskin and Luis Ceze},
  booktitle = {Proceedings of the 10th {ACM} {SIGOPS} {E}uropean
		  {C}onference on {C}omputer {S}ystems ({EuroSys} '15)},
  month = apr,
  note = {Poster},
  organization = {{ACM}},
  title = {Claret: Using Data Types for Highly Concurrent Distributed
		  Transactions},
  year = {2015},
  monthnum = {04}
}
@inproceedings{ports15:_desig_distr_system_using_approx,
  address = {Oakland, CA, USA},
  author = {Dan R. K. Ports and Jialin Li and Vincent Liu and Naveen
		  Kr. Sharma and Arvind Krishnamurthy},
  booktitle = {Proceedings of the 12th {USENIX} {S}ymposium on
		  {N}etworked {S}ystems {D}esign and {I}mplementation ({NSDI}
		  '15)},
  month = may,
  organization = {{USENIX}},
  title = {Designing Distributed Systems Using Approximate Synchrony
		  in Datacenter Networks},
  year = {2015},
  abstract = {Distributed systems are traditionally designed
		  independently from the underlying network, making
		  worst-case assumptions (e.g., complete asynchrony) about
		  its behavior. However, many of today's distributed
		  applications are deployed in data centers, where the
		  network is more reliable, predictable, and extensible. In
		  these environments, it is possible to co-design distributed
		  systems with their network layer, and doing so can offer
		  substantial benefits. 

This paper explores network-level mechanisms for providing Mostly-Ordered Multicast (MOM): a best-effort ordering property for concurrent multicast operations. Using this primitive, we design Speculative Paxos, a state machine replication protocol that relies on the network to order requests in the normal case. This approach leads to substantial performance benefits: under realistic data center conditions, Speculative Paxos can provide 40\% lower latency and 2.6x higher throughput than the standard Paxos protocol. It offers lower latency than a latency-optimized protocol (Fast Paxos) with the same throughput as a throughput-optimized protocol (batching).},
  pdf = {papers/specpaxos-nsdi15.pdf},
  slidespdf = {papers/specpaxos-nsdi15-slides.pdf},
  award = {Best Paper Award},
  code = {https://github.com/uwsyslab/specpaxos/},
  monthnum = {05}
}

@inproceedings{sharma15:_transtorm_poster,
  address = {Monterey, CA, USA},
  author = {Naveen Kr. Sharma and Brandon Holt and Irene Zhang and Dan
		  R. K. Ports and Marcos Aguilera},
  booktitle = {Proceedings of the 25th {ACM} {S}ymposium on {O}perating
		  {S}ystems {P}rinciples ({SOSP} '15)},
  month = oct,
  note = {Poster},
  organization = {{ACM}},
  title = {Transtorm: A Benchmark Suite for Transactional Key-Value
		  Storage Systems},
  year = {2015},
  monthnum = {10}
}
@inproceedings{zhang15:_build_consis_trans_incon_replic,
  address = {Monterey, CA, USA},
  author = {Irene Zhang and Naveen Kr. Sharma and Adriana Szekeres and
		  Arvind Krishnamurthy and Dan R. K. Ports},
  booktitle = {Proceedings of the 25th {ACM} {S}ymposium on {O}perating
		  {S}ystems {P}rinciples ({SOSP} '15)},
  month = oct,
  organization = {{ACM}},
  title = {Building Consistent Transactions with Inconsistent
		  Replication},
  year = {2015},
  abstract = {Application programmers increasingly prefer distributed
		  storage systems with strong consistency and distributed
		  transactions (e.g., Google's Spanner) for their strong
		  guarantees and ease of use. Unfortunately, existing
		  transactional storage systems are expensive to use -- in
		  part because they require costly replication protocols,
		  like Paxos, for fault tolerance. In this paper, we present
		  a new approach that makes transactional storage systems
		  more affordable: we eliminate consistency from the
		  replication protocol while still providing distributed
		  transactions with strong consistency to applications. 

We present TAPIR -- the Transactional Application Protocol for Inconsistent Replication -- the first transaction protocol to use a novel replication protocol, called inconsistent replication, that provides fault tolerance without consistency. By enforcing strong consistency only in the transaction protocol, TAPIR can commit transactions in a single round-trip and order distributed transactions without centralized coordination. We demonstrate the use of TAPIR in a transactional key-value store, TAPIR-KV. Compared to conventional systems, TAPIR-KV provides better latency and throughput.},
  pdf = {papers/tapir-sosp15.pdf},
  code = {https://github.com/uwsyslab/tapir},
  monthnum = {10}
}

@inproceedings{li14:_tales_tail,
  address = {Seattle, WA, USA},
  author = {Jialin Li and Naveen Kr. Sharma and Dan R. K. Ports and
		  Steven D. Gribble},
  booktitle = {Proceedings of the 5th Symposium on Cloud Computing
		  ({SOCC} '14)},
  month = nov,
  organization = {{ACM}},
  title = {Tales of the Tail: Hardware, {OS}, and Application-level
		  Sources of Tail Latency},
  year = {2014},
  abstract = {Interactive services often have large-scale parallel
		  implementations. To deliver fast responses, the median and
		  tail latencies of a service's components must be low. In
		  this paper, we explore the hardware, OS, and
		  application-level sources of poor tail latency in high
		  throughput servers executing on multi-core machines. 

We model these network services as a queuing system in order to establish the best-achievable latency distribution. Using fine-grained measurements of three different servers (a null RPC service, Memcached, and Nginx) on Linux, we then explore why these servers exhibit significantly worse tail latencies than queuing models alone predict. The underlying causes include interference from background processes, request re-ordering caused by poor scheduling or constrained concurrency models, suboptimal interrupt routing, CPU power saving mechanisms, and NUMA effects.

We systematically eliminate these factors and show that Memcached can achieve a median latency of 11 us and a 99.9th percentile latency of 32 us at 80\% utilization on a four-core system. In comparison, a naive deployment of Memcached at the same utilization on a single-core system has a median latency of 100 us and a 99.9th percentile latency of 5 ms. Finally, we demonstrate that tradeoffs exist between throughput, energy, and tail latency.},
  pdf = {papers/latency-socc14.pdf},
  monthnum = {11}
}

@inproceedings{peter14:_arrak,
  address = {Broomfield, CO, USA},
  author = {Simon Peter and Jialin Li and Irene Zhang and Dan R. K.
		  Ports and Doug Woos and Arvind Krishnamurthy and Thomas
		  Anderson and Timothy Roscoe},
  booktitle = {Proceedings of the 11th {USENIX} {S}ymposium on
		  {O}perating {S}ystems {D}esign and {I}mplementation ({OSDI}
		  '14)},
  month = oct,
  organization = {{USENIX}},
  title = {Arrakis: The Operating System is the Control Plane},
  year = {2014},
  abstract = {Recent device hardware trends enable a new approach to the
		  design of network server operating systems. In a
		  traditional operating system, the kernel mediates access to
		  device hardware by server applications, to enforce process
		  isolation as well as network and disk security. We have
		  designed and implemented a new operating system, Arrakis,
		  that splits the traditional role of the kernel in two.
		  Applications have direct access to virtualized I/O devices,
		  allowing most I/O operations to skip the kernel entirely,
		  while the kernel is re-engineered to provide network and
		  disk protection without kernel mediation of every
		  operation. We describe the hardware and software changes
		  needed to take advantage of this new abstraction, and we
		  illustrate its power by showing improvements of 2--5x in
		  latency and 9x in throughput for a popular persistent NoSQL
		  store relative to a well-tuned Linux implementation.},
  pdf = {papers/arrakis-osdi14.pdf},
  award = {Jay Lepreau Best Paper Award},
  code = {https://github.com/UWNetworksLab/arrakis},
  monthnum = {10}
}
@inproceedings{peter14:_towar_high_perfor_applic_level_storag_manag,
  address = {Philadelphia, PA, USA},
  author = {Simon Peter and Jialin Li and Doug Woos and Irene Zhang
		  and Dan R. K. Ports and Thomas Anderson and Arvind
		  Krishnamurthy and Mark Zbikowski},
  booktitle = {Proceedings of the 6th Workshop on Hot Topics in Storage
		  and File Systems ({HotStorage} '14)},
  month = jun,
  organization = {{USENIX}},
  title = {Towards High-Performance Application-Level Storage
		  Management},
  year = {2014},
  abstract = {We propose a radical re-architecture of the traditional
		  operating system storage stack to move the kernel off the
		  data path. Leveraging virtualized I/O hardware for disk and
		  flash storage, most read and write I/O operations go
		  directly to application code. The kernel dynamically
		  allocates extents, manages the virtual to physical binding,
		  and performs name translation. The benefit is to
		  dramatically reduce the CPU overhead of storage operations
		  while improving application flexibility.},
  pdf = {papers/arrakis-hotstorage14.pdf},
  monthnum = {06}
}
@inproceedings{zhang14:_optim_replic_two_phase_commit,
  address = {Beijing, China},
  author = {Irene Zhang and Naveen Kr. Sharma and Adriana Szekeres and
		  Arvind Krishnamurthy and Dan R. K. Ports},
  booktitle = {Proceedings of the 5th Asia-Pacific Workshop on Systems
		  ({APSys} '14)},
  key = {APSys 2014},
  month = jun,
  note = {Poster and extended abstract},
  title = {Optimistic Replicated Two-Phase Commit},
  year = {2014},
  monthnum = {06}
}
@inproceedings{zhuo14:_machin_fault_toler_reliab_datac_system,
  address = {Beijing, China},
  author = {Danyang Zhuo and Qiao Zhang and Dan R. K. Ports and Arvind
		  Krishnamurthy and Thomas Anderson},
  booktitle = {Proceedings of the 5th Asia-Pacific Workshop on Systems
		  ({APSys} '14)},
  key = {APSys 2014},
  month = jun,
  title = {Machine Fault Tolerance for Reliable Datacenter Systems},
  year = {2014},
  abstract = {Although rare in absolute terms, undetected CPU, memory,
		  and disk errors occur often enough at data center scale to
		  significantly affect overall system reliability and
		  availability. In this paper, we propose a new failure
		  model, called Machine Fault Tolerance, and a new
		  abstraction, a replicated write-once trusted table, to
		  provide improved resilience to these types of failures.
		  Since most machine failures manifest in application server
		  and operating system code, we assume a Byzantine model for
		  those parts of the system. However, by assuming that the
		  hypervisor and network are trustworthy, we are able to
		  reduce the overhead of machine-fault masking to be close to
		  that of non-Byzantine Paxos.},
  pdf = {papers/mft-apsys14.pdf},
  monthnum = {06}
}
@inproceedings{cheng12:_abstr_usabl_infor_flow_contr_aeolus,
  address = {Boston, MA, USA},
  author = {Winnie Cheng and Dan R. K. Ports and David Schultz and
		  Victoria Popic and Aaron Blankstein and James Cowling and
		  Dorothy Curtis and Liuba Shrira and Barbara Liskov},
  booktitle = {Proceedings of the 2012 {USENIX} {A}nnual {T}echnical
		  {C}onference},
  month = jun,
  organization = {{USENIX}},
  title = {Abstractions for Usable Information Flow Control in
		  {Aeolus}},
  year = {2012},
  abstract = {Despite the increasing importance of protecting
		  confidential data, building secure software remains as
		  challenging as ever. This paper describes Aeolus, a new
		  platform for building secure distributed applications.
		  Aeolus uses information flow control to provide
		  confidentiality and data integrity. It differs from
		  previous information flow control systems in a way that we
		  believe makes it easier to understand and use. Aeolus uses
		  a new, simpler security model, the first to combine a
		  standard principal-based scheme for authority management
		  with thread-granularity information flow tracking. The
		  principal hierarchy matches the way developers already
		  reason about authority and access control, and the
		  coarse-grained information flow tracking eases the task of
		  defining a program's security restrictions. In addition,
		  Aeolus provides a number of new mechanisms (authority
		  closures, compound tags, boxes, and shared volatile state)
		  that support common design patterns in secure application
		  design.},
  pdf = {papers/aeolus-usenix12.pdf},
  slidespdf = {papers/aeolus-usenix12-slides.pdf},
  code = {http://pmg.csail.mit.edu/aeolus/#sw},
  monthnum = {06}
}
@inproceedings{ports10:_trans_consis_autom_manag_applic_data_cache,
  address = {Vancouver, BC, Canada},
  author = {Dan R. K. Ports and Austin T. Clements and Irene Zhang and
		  Samuel Madden and Barbara Liskov},
  booktitle = {Proceedings of the 9th {USENIX} {S}ymposium on {O}perating
		  {S}ystems {D}esign and {I}mplementation ({OSDI} '10)},
  month = oct,
  organization = {{USENIX}},
  title = {Transactional Consistency and Automatic Management in an
		  Application Data Cache},
  year = {2010},
  abstract = {Distributed in-memory application data caches like
		  memcached are a popular solution for scaling
		  database-driven web sites. These systems are easy to add to
		  existing deployments, and increase performance
		  significantly by reducing load on both the database and
		  application servers. Unfortunately, such caches do not
		  integrate well with the database or the application. They
		  cannot maintain transactional consistency across the entire
		  system, violating the isolation properties of the
		  underlying database. They leave the application responsible
		  for locating data in the cache and keeping it up to date, a
		  frequent source of application complexity and programming
		  errors. 

Addressing both of these problems, we introduce a transactional cache, TxCache, with a simple programming model. TxCache ensures that any data seen within a transaction, whether it comes from the cache or the database, reflects a slightly stale but consistent snapshot of the database. TxCache makes it easy to add caching to an application by simply designating functions as cacheable; it automatically caches their results, and invalidates the cached data as the underlying database changes. Our experiments found that adding TxCache increased the throughput of a web application by up to 5.2x, only slightly less than a non-transactional cache, showing that consistency does not have to come at the price of performance.},
  pdf = {papers/txcache-osdi10.pdf},
  psgz = {papers/txcache-osdi10.ps.gz},
  slidespdf = {papers/txcache-osdi10-slides.pdf},
  code = {https://github.com/drkp/txcache/},
  monthnum = {10}
}

@inproceedings{cowling09:_census,
  address = {San Diego, CA, USA},
  author = {James Cowling and Dan R. K. Ports and Barbara Liskov and
		  Raluca Ada Popa and Abhijeet Gaikwad},
  booktitle = {Proceedings of the 2009 {USENIX} {A}nnual {T}echnical
		  {C}onference},
  month = jun,
  organization = {{USENIX}},
  title = {Census: Location-Aware Membership Management for
		  Large-Scale Distributed Systems},
  year = {2009},
  abstract = {We present Census, a platform for building large-scale
		  distributed applications. Census provides a membership
		  service and a multicast mechanism. The membership service
		  provides every node with a consistent view of the system
		  membership, which may be global or partitioned into
		  location-based regions. Census distributes membership
		  updates with low overhead, propagates changes promptly, and
		  is resilient to both crashes and Byzantine failures. We
		  believe that Census is the first system to provide a
		  consistent membership abstraction at very large scale,
		  greatly simplifying the design of applications built atop
		  large deployments such as multi-site data centers. 

Census builds on a novel multicast mechanism that is closely integrated with the membership service. It organizes nodes into a reliable overlay composed of multiple distribution trees, using network coordinates to minimize latency. Unlike other multicast systems, it avoids the cost of using distributed algorithms to construct and maintain trees. Instead, each node independently produces the same trees from the consistent membership view. Census uses this multicast mechanism to distribute membership updates, along with application-provided messages.

We evaluate the platform under simulation and on a real-world deployment on PlanetLab. We find that it imposes minimal bandwidth overhead, is able to react quickly to node failures and changes in the system membership, and can scale to substantial size.},
  pdf = {papers/census-usenix09.pdf},
  psgz = {papers/census-usenix09.ps.gz},
  slidespdf = {papers/census-usenix09-slides.pdf},
  monthnum = {06}
}

@inproceedings{ports09:_trans_cachin_applic_data_recen_snaps,
  address = {Big Sky, MT, USA},
  author = {Dan R. K. Ports and Austin T. Clements and Irene Y. Zhang
		  and Samuel Madden and Barbara Liskov},
  booktitle = {Proceedings of the 22nd {ACM} {S}ymposium on {O}perating
		  {S}ystems {P}rinciples ({SOSP} '09)},
  month = oct,
  note = {Work in Progress report},
  organization = {{ACM}},
  title = {Transactional Caching of Application Data using Recent
		  Snapshots},
  year = {2009},
  abstract = {Many of today's well-known websites use application data
		  caches to reduce the bottleneck load on the database, as
		  well as the computational load on the application servers.
		  Distributed in-memory shared caches, exemplified by
		  memcached, are one popular approach. These caches
		  typically provide a get/put interface, akin to a
		  distributed hash table; the application chooses what data
		  to keep in the cache and keeps it up to date. By storing
		  the cache entirely in memory and horizontally partitioning
		  among nodes, in-memory caches provide quick response times
		  and ease of scaling. 

However, existing caches have no notion of transactional consistency: there is no way to ensure that two accesses to the cache reflect a view of the database at the same point in time. While the backing database goes to great lengths to ensure this property (serializable isolation), the caching layer violates these guarantees. The resulting inconsistencies can have unpleasant consequences if exposed to the user (e.g., attributing the latest bid to the wrong user on an auction site), or add complexity to application code by forcing it to cope with temporarily violated invariants.

We argue that transactional semantics are not incompatible with cache performance and scalability. We introduce a transactional cache, TxCache, which guarantees that all values retrieved from the cache or database during a transaction reflect a consistent snapshot of the database.

TxCache also strives to simplify application design by helping manage the cache. Instead of requiring applications to manually insert and check for values in the cache, TxCache provides a library with which programmers simply designate functions as cacheable, and the library checks the cache for previous calls with the same arguments. In particular, and unlike memcached, TxCache does not require applications to explicitly invalidate cached values; correctly identifying the values to invalidate is difficult because it requires global reasoning about the application.},
  pdf = {papers/txcache-sosp09wip-abstract.pdf},
  psgz = {papers/txcache-sosp09wip-abstract.ps.gz},
  slidespdf = {papers/txcache-sosp09wip-slides.pdf},
  monthnum = {10}
}

@inproceedings{chen08:_overs,
  address = {Seattle, WA, USA},
  author = {Xiaoxin Chen and Tal Garfinkel and E. Christopher Lewis
		  and Pratap Subrahmanyam and Carl A. Waldspurger and Dan
		  Boneh and Jeffrey Dwoskin and Dan R. K. Ports},
  booktitle = {Proceedings of the 13th {I}nternational {C}onference on
		  {A}rchitectural {S}upport for {P}rogramming {L}anguages and
		  {O}perating {S}ystems ({ASPLOS '08})},
  month = mar,
  organization = {{ACM}},
  title = {Overshadow: A Virtualization-Based Approach to
		  Retrofitting Protection in Commodity Operating Systems},
  year = {2008},
  abstract = {Commodity operating systems entrusted with securing
		  sensitive data are remarkably large and complex, and
		  consequently, frequently prone to compromise. To address
		  this limitation, we introduce a virtual-machine-based
		  system called Overshadow that protects the privacy and
		  integrity of application data, even in the event of a total
		  OS compromise. Overshadow presents an application with a
		  normal view of its resources, but the OS with an encrypted
		  view. This allows the operating system to carry out the
		  complex task of managing an application's resources,
		  without allowing it to read or modify them. Thus,
		  Overshadow offers a last line of defense for application
		  data. 

Overshadow builds on multi-shadowing, a novel mechanism that presents different views of ``physical'' memory, depending on the context performing the access. This primitive offers an additional dimension of protection beyond the hierarchical protection domains implemented by traditional operating systems and processor architectures.

We present the design and implementation of Overshadow and show how its new protection semantics can be integrated with existing systems. Our design has been fully implemented and used to protect a wide range of unmodified legacy applications running on an unmodified Linux operating system. We evaluate the performance of our implementation, demonstrating that this approach is practical.},
  pdf = {papers/overshadow-asplos08.pdf},
  psgz = {papers/overshadow-asplos08.ps.gz},
  monthnum = {03}
}

@inproceedings{ports08:_towar_applic_secur_untrus_operat_system,
  address = {San Jose, CA, USA},
  author = {Dan R. K. Ports and Tal Garfinkel},
  booktitle = {Proceedings of the 3rd Workshop on Hot Topics in Security
		  (HotSec '08)},
  month = jul,
  organization = {{USENIX}},
  title = {Towards Application Security on Untrusted Operating
		  Systems},
  year = {2008},
  abstract = {Complexity in commodity operating systems makes
		  compromises inevitable. Consequently, a great deal of work
		  has examined how to protect security-critical portions of
		  applications from the OS through mechanisms such as
		  microkernels, virtual machine monitors, and new processor
		  architectures. Unfortunately, most work has focused on CPU
		  and memory isolation and neglected OS semantics. Thus,
		  while much is known about how to prevent OS and application
		  processes from modifying each other, far less is understood
		  about how different OS components can undermine application
		  security if they turn malicious. 

We consider this problem in the context of our work on Overshadow, a virtual-machine-based system for retrofitting protection in commodity operating systems. We explore how malicious behavior in each major OS subsystem can undermine application security, and present potential mitigations. While our discussion is presented in terms of Overshadow and Linux, many of the problems and solutions are applicable to other systems where trusted applications rely on untrusted, potentially malicious OS components.},
  pdf = {papers/overshadow-hotsec08.pdf},
  psgz = {papers/overshadow-hotsec08.ps.gz},
  slidespdf = {papers/overshadow-hotsec08-slides.pdf},
  monthnum = {07}
}

@inproceedings{clements05:_arpeg,
  address = {Ithaca, NY, USA},
  author = {Austin T. Clements and Dan R. K. Ports and David R.
		  Karger},
  booktitle = {Proceedings of the 4th International Workshop on
		  Peer-to-Peer Systems ({IPTPS} '05)},
  key = {IPTPS '05},
  month = feb,
  pages = {58--68},
  publisher = {Springer},
  series = {Lecture Notes in Computer Science},
  title = {Arpeggio: Metadata Searching and Content Sharing with
		  {C}hord},
  volume = {3640},
  year = {2005},
  abstract = {Arpeggio is a peer-to-peer file-sharing network based on
		  the Chord lookup primitive. Queries for data whose metadata
		  matches a certain criterion are performed efficiently by
		  using a distributed keyword-set index, augmented with
		  index-side filtering. We introduce index gateways, a
		  technique for minimizing index maintenance overhead.
		  Because file data is large, Arpeggio employs subrings to
		  track live source peers without the cost of inserting the
		  data itself into the network. Finally, we introduce
		  postfetching, a technique that uses information in the
		  index to improve the availability of rare files. The result
		  is a system that provides efficient query operations with
		  the scalability and reliability advantages of full
		  decentralization, and a content distribution system tuned
		  to the requirements and capabilities of a peer-to-peer
		  network.},
  pdf = {papers/arpeggio-iptps05.pdf},
  psgz = {papers/arpeggio-iptps05.ps.gz},
  slidespdf = {papers/arpeggio-iptps05-slides.pdf},
  monthnum = {02}
}
@inproceedings{ports05:_persif,
  address = {Brighton, United Kingdom},
  author = {Dan R. K. Ports and Austin T. Clements and Erik D.
		  Demaine},
  booktitle = {Proceedings of the 20th {ACM} {S}ymposium on {O}perating
		  {S}ystems {P}rinciples ({SOSP} '05)},
  month = oct,
  note = {Poster and extended abstract},
  organization = {{ACM}},
  title = {{PersiFS}: A Versioned File System with an Efficient
		  Representation},
  year = {2005},
  monthnum = {10}
}
@inproceedings{clements04:_arpeg,
  address = {Cambridge, MA, USA},
  author = {Austin T. Clements and Dan R. K. Ports and David R.
		  Karger},
  booktitle = {Proceedings of the 2nd Project IRIS Student Workshop
		  ({ISW} '04)},
  key = {ISW '04},
  month = nov,
  note = {Poster and extended abstract},
  title = {Arpeggio: Efficient Metadata-based Searching and File
		  Transfer with {DHTs}},
  year = {2004},
  abstract = {Arpeggio is a peer-to-peer file-sharing network
		  based on the Chord distributed hash table. Queries for
		  files whose metadata matches a certain criterion are
		  performed efficiently by using a distributed
		  keyword-set index, augmented with index-side
		  filtering. We introduce metadata gateways, a
		  technique for minimizing index maintenance overhead.
		  Arpeggio also uses the DHT for indirect
		  storage of file contents, maintaining pointers from
		  content to the live peers that provide it. Finally, we
		  introduce postfetching, a technique that uses
		  information in the index to improve the availability of
		  rare files. The result is a system that provides efficient
		  query operations with the scalability and reliability
		  advantages of full decentralization, and a content
		  distribution system tuned to the requirements of a
		  peer-to-peer file-sharing network.},
  monthnum = {11}
}
@mastersthesis{ports07:_metad_index_in_struc_peer,
  address = {Cambridge, MA, USA},
  author = {Dan R. K. Ports},
  month = feb,
  school = {Massachusetts Institute of Technology},
  type = {M.Eng. thesis},
  title = {Arpeggio: Metadata Indexing in a Structured Peer-to-Peer
		  Network},
  year = {2007},
  abstract = {Peer-to-peer networks require an efficient means for
		  performing searches for files by metadata keywords.
		  Unfortunately, current methods usually sacrifice either
		  scalability or recall. Arpeggio is a peer-to-peer
		  file-sharing network that uses the Chord lookup primitive
		  as a basis for constructing a distributed keyword-set index,
		  augmented with index-side filtering, to address this
		  problem. We introduce index gateways, a technique for
		  minimizing index maintenance overhead. Arpeggio also
		  includes a content distribution system for finding source
		  peers for a file; we present a novel system that uses Chord
		  subrings to track live source peers without the cost of
		  inserting the data itself into the network, and supports
		  postfetching: using information in the index to improve the
		  availability of rare files. The result is a system that
		  provides efficient query operations with the scalability
		  and reliability advantages of full decentralization. We use
		  analysis and simulation results to show that our indexing
		  system has reasonable storage and bandwidth costs, and
		  improves load distribution.},
  pdf = {papers/arpeggio-meng.pdf},
  psgz = {papers/arpeggio-meng.ps.gz},
  monthnum = {02}
}
@techreport{sapio19:_scalin_distr_machin_learn_in_networ_aggreg,
  author = {Amedeo Sapio and Marco Canini and Chen-Yu Ho and Jacob
		  Nelson and Panos Kalnis and Changhoon Kim and Arvind
		  Krishnamurthy and Masoud Moshref and Dan R. K. Ports and
		  Peter Richtarik},
  institution = {KAUST},
  month = feb,
  title = {Scaling Distributed Machine Learning with In-Network
		  Aggregation},
  year = {2019},
  abstract = {Training complex machine learning models in parallel is an
		  increasingly important workload. We accelerate distributed
		  parallel training by designing a communication primitive
		  that uses a programmable switch dataplane to execute a key
		  step of the training process. Our approach, SwitchML,
		  reduces the volume of exchanged data by aggregating the
		  model updates from multiple workers in the network. We
		  co-design the switch processing with the end-host protocols
		  and ML frameworks to provide a robust, efficient solution
		  that speeds up training by up to 300\%, and at least by
		  20\% for a number of real-world benchmark models.},
  pdf = {papers/switchml-tr19.pdf},
  monthnum = {02}
}
@techreport{zhu19:_harmon,
  author = {Hang Zhu and Zhihao Bai and Jialin Li and Ellis Michael
		  and Dan R. K. Ports and Ion Stoica and Xin Jin},
  institution = {arXiv},
  month = apr,
  number = {1904.08964},
  type = {arXiv preprint},
  title = {Harmonia: Near-Linear Scalability for Replicated Storage
		  with In-Network Conflict Detection},
  year = {2019},
  abstract = {Distributed storage employs replication to mask failures
		  and improve availability. However, these systems typically
		  exhibit a hard tradeoff between consistency and
		  performance. Ensuring consistency introduces coordination
		  overhead, and as a result the system throughput does not
		  scale with the number of replicas. We present Harmonia, a
		  replicated storage architecture that exploits the
		  capability of new-generation programmable switches to
		  obviate this tradeoff by providing near-linear scalability
		  without sacrificing consistency. To achieve this goal,
		  Harmonia detects read-write conflicts in the network, which
		  enables any replica to serve reads for objects with no
		  pending writes. Harmonia implements this functionality at
		  line rate, thus imposing no performance overhead. We have
		  implemented a prototype of Harmonia on a cluster of
		  commodity servers connected by a Barefoot Tofino switch,
		  and have integrated it with Redis. We demonstrate the
		  generality of our approach by supporting a variety of
		  replication protocols, including primary-backup, chain
		  replication, Viewstamped Replication, and NOPaxos.
		  Experimental results show that Harmonia improves the
		  throughput of these protocols by up to 10x for a
		  replication factor of 10, providing near-linear scalability
		  up to the limit of our testbed.},
  pdf = {papers/harmonia-arxiv19.pdf},
  monthnum = {04}
}
@techreport{li18:_pegas,
  address = {Seattle, WA, USA},
  author = {Jialin Li and Jacob Nelson and Xin Jin and Dan R. K.
		  Ports},
  institution = {University of Washington CSE},
  month = dec,
  number = {UW-CSE-18-12-01},
  title = {Pegasus: Load-Aware Selective Replication with an
		  In-Network Coherence Directory},
  year = {2018},
  abstract = {High performance distributed storage systems face the
		  challenge of load imbalance caused by skewed and dynamic
		  workloads. This paper introduces Pegasus, a new storage
		  architecture that leverages new-generation programmable
		  switch ASICs to balance load across storage servers.
		  Pegasus uses selective replication of the most popular
		  objects in the data store to distribute load. Using a novel
		  in-network coherence directory, the Pegasus switch tracks
		  and manages the location of replicated objects. This allows
		  it to achieve load-aware forwarding and dynamic rebalancing
		  for replicated keys, while still guaranteeing data
		  coherence. The Pegasus design is practical to implement as
		  it stores only forwarding metadata in the switch data
		  plane. The resulting system improves the 99\% tail latency
		  of a distributed in-memory key-value store by more than
		  95\%, and yields up to a 9x throughput improvement under a
		  latency SLO -- results which hold across a large set of
		  workloads with varying degrees of skewness, read/write
		  ratio, and dynamism.},
  pdf = {papers/pegasus-tr18.pdf},
  monthnum = {12}
}
@article{zhang18:_build_consis_trans_incon_replic,
  author = {Irene Zhang and Naveen Kr. Sharma and Adriana Szekeres and
		  Arvind Krishnamurthy and Dan R. K. Ports},
  journal = {{ACM} Transactions on Computer Systems},
  month = dec,
  number = {4},
  pages = {12},
  title = {Building Consistent Transactions with Inconsistent
		  Replication},
  year = {2018},
  abstract = {Application programmers increasingly prefer distributed
		  storage systems with strong consistency and distributed
		  transactions (e.g., Google's Spanner) for their strong
		  guarantees and ease of use. Unfortunately, existing
		  transactional storage systems are expensive to use -- in
		  part because they require costly replication protocols,
		  like Paxos, for fault tolerance. In this paper, we present
		  a new approach that makes transactional storage systems
		  more affordable: we eliminate consistency from the
		  replication protocol while still providing distributed
		  transactions with strong consistency to applications. 

We present TAPIR -- the Transactional Application Protocol for Inconsistent Replication -- the first transaction protocol to use a novel replication protocol, called inconsistent replication, that provides fault tolerance without consistency. By enforcing strong consistency only in the transaction protocol, TAPIR can commit transactions in a single round-trip and order distributed transactions without centralized coordination. We demonstrate the use of TAPIR in a transactional key-value store, TAPIR-KV. Compared to conventional systems, TAPIR-KV provides both better latency and better throughput.},
  pdf = {papers/tapir-tocs18.pdf},
  code = {https://github.com/uwsyslab/tapir/},
  monthnum = {12}
}

@techreport{li17:_eris_tr,
  address = {Seattle, WA, USA},
  author = {Jialin Li and Ellis Michael and Dan R. K. Ports},
  institution = {University of Washington CSE},
  month = oct,
  number = {UW-CSE-TR-17-10-01},
  title = {{Eris}: Coordination-Free Consistent Transactions using
		  Network Multi-Sequencing (Extended Version)},
  year = {2017},
  abstract = {Distributed storage systems aim to provide strong
		  consistency and isolation guarantees on an architecture
		  that is partitioned across multiple shards for scalability
		  and replicated for fault-tolerance. Traditionally,
		  achieving all of these goals has required an expensive
		  combination of atomic commitment and replication protocols
		  -- introducing extensive coordination overhead. Our system,
		  Eris, takes a very different approach. It moves a core
		  piece of concurrency control functionality, which we term
		  multi-sequencing, into the datacenter network itself. This
		  network primitive takes on the responsibility for
		  consistently ordering transactions, and a new lightweight
		  transaction protocol ensures atomicity. The end result is
		  that Eris avoids both replication and transaction
		  coordination overhead: we show that it can process a large
		  class of distributed transactions in a single round-trip
		  from the client to the storage system without any explicit
		  coordination between shards or replicas. It provides
		  atomicity, consistency, and fault-tolerance with less than
		  10\% overhead -- achieving throughput 4.5--35x higher and
		  latency 72--80\% lower than a conventional design on
		  standard benchmarks.},
  pdf = {papers/eris-tr17.pdf},
  monthnum = {10}
}
@techreport{michael17:_recov_shared_objec_without_stabl,
  address = {Seattle, WA, USA},
  author = {Ellis Michael and Dan R. K. Ports and Naveen Kr. Sharma
		  and Adriana Szekeres},
  institution = {University of Washington CSE},
  month = aug,
  number = {UW-CSE-17-08-01},
  title = {Recovering Shared Objects Without Stable Storage (Extended
		  Version)},
  year = {2017},
  abstract = {This paper considers the problem of building
		  fault-tolerant shared objects when processes can crash and
		  recover but lose their persistent state on recovery. This
		  Diskless Crash-Recovery (DCR) model matches the way many
		  long-lived systems are built. We show that it presents new
		  challenges, as operations that are recorded at a quorum may
		  not persist after some of the processes in that quorum
		  crash and then recover. 

To address this problem, we introduce the notion of crash-consistent quorums, where no recoveries happen during the quorum responses. We show that relying on crash-consistent quorums enables a recovery procedure that can recover all operations that successfully finished. Crash-consistent quorums can be easily identified using a mechanism we term the crash vector, which tracks the causal relationship between crashes, recoveries, and other operations.

We apply crash-consistent quorums and crash vectors to build two storage primitives. We give a new algorithm for multi-reader multi-writer atomic registers in the DCR model that guarantees safety under all conditions and termination under a natural condition. It improves on the best prior protocol for this problem by requiring fewer rounds, fewer nodes to participate in the quorum, and a less restrictive liveness condition. We also present a more efficient single-reader, single-writer atomic set---a virtual stable storage abstraction. It can be used to lift any existing algorithm from the traditional Crash-Recovery with Stable Storage model to the DCR model. We examine a specific application, state machine replication, and show that existing diskless protocols can violate their correctness guarantees, while ours offers a general and correct solution.},
  pdf = {papers/recovery-tr17.pdf},
  monthnum = {08}
}

@techreport{holt16:_discip_incon,
  address = {Seattle, WA, USA},
  author = {Brandon Holt and James Bornholt and Irene Zhang and Dan R.
		  K. Ports and Mark Oskin and Luis Ceze},
  institution = {University of Washington CSE},
  month = jun,
  number = {UW-CSE-TR-16-06-01},
  title = {Disciplined Inconsistency},
  year = {2016},
  abstract = {Distributed applications and web services, such as online
		  stores or social networks, are expected to be scalable,
		  available, responsive, and fault-tolerant. To meet these
		  steep requirements in the face of high round-trip
		  latencies, network partitions, server failures, and load
		  spikes, applications use eventually consistent datastores
		  that allow them to weaken the consistency of some data.
		  However, making this transition is highly error-prone
		  because relaxed consistency models are notoriously
		  difficult to understand and test. 

In this work, we propose a new programming model for distributed data that makes consistency properties explicit and uses a type system to enforce consistency safety. With the Inconsistent, Performance-bound, Approximate (IPA) storage system, programmers specify performance targets and correctness requirements as constraints on persistent data structures and handle uncertainty about the result of datastore reads using new \emph{consistency types}. We implement a prototype of this model in Scala on top of an existing datastore, Cassandra, and use it to make performance/correctness tradeoffs in two applications: a ticket sales service and a Twitter clone. Our evaluation shows that IPA prevents consistency-based programming errors and adapts consistency automatically in response to changing network conditions, performing comparably to weak consistency and 2--10x faster than strong consistency.},
  pdf = {papers/ipa-tr16.pdf},
  supersededby = {holt16:_discip_incon_consis_types},
  supersededas = {Tech. Report (2016)},
  monthnum = {06}
}

@techreport{li16:_fast_replic_nopax_tr,
  address = {Seattle, WA, USA},
  author = {Jialin Li and Ellis Michael and Adriana Szekeres and
		  Naveen Kr. Sharma and Dan R. K. Ports},
  institution = {University of Washington CSE},
  number = {UW-CSE-TR-16-09-02},
  title = {Just Say {NO} to {Paxos} Overhead: Replacing Consensus
		  with Network Ordering (Extended Version)},
  year = {2016},
  abstract = {Distributed applications use replication, implemented by
		  protocols like Paxos, to ensure data availability and
		  transparently mask server failures. This paper presents a
		  new approach to achieving replication in the data center
		  without the performance cost of traditional methods. Our
		  work carefully divides replication responsibility between
		  the network and protocol layers. The network orders
		  requests but does not ensure reliable delivery -- using a
		  new primitive we call ordered unreliable multicast (OUM).
		  Implementing this primitive can be achieved with
		  near-zero-cost in the data center. Our new replication
		  protocol, Network-Ordered Paxos (NOPaxos), exploits network
		  ordering to provide strongly consistent replication without
		  coordination. The resulting system not only outperforms
		  both latency- and throughput-optimized protocols on their
		  respective metrics, but also yields throughput within 2\%
		  and latency within 16 us of an unreplicated system --
		  providing replication without the performance cost.},
  pdf = {papers/nopaxos-tr16.pdf},
  code = {https://github.com/uwsyslab/nopaxos/},
  monthnum = {09}
}
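The OUM primitive described above can be sketched in a few lines: a sequencer stamps every message with a sequence number, and receivers deliver in stamp order, surfacing losses as explicit drop notifications rather than reorderings. This toy Python sketch is a hypothetical illustration, not the NOPaxos implementation (which places the sequencer in the network).

# Toy sketch of ordered unreliable multicast (OUM). Hypothetical
# illustration only; names are invented for this example.

class Sequencer:
    def __init__(self):
        self.seq = 0
    def stamp(self, msg):
        self.seq += 1
        return (self.seq, msg)

class Receiver:
    def __init__(self):
        self.expected = 1
    def deliver(self, stamped):
        seq, msg = stamped
        events = []
        while self.expected < seq:                  # a stamp was skipped:
            events.append(("DROP", self.expected))  # report the gap
            self.expected += 1
        events.append(("DELIVER", msg))
        self.expected = seq + 1
        return events

seqr = Sequencer()
rcvr = Receiver()
a = seqr.stamp("op-A")
b = seqr.stamp("op-B")   # imagine the network loses this one
c = seqr.stamp("op-C")
print(rcvr.deliver(a))   # [('DELIVER', 'op-A')]
print(rcvr.deliver(c))   # [('DROP', 2), ('DELIVER', 'op-C')]
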
@techreport{michael16:_provid_stabl_storag_diskl_crash,
  address = {Seattle, WA, USA},
  author = {Ellis Michael and Dan R. K. Ports and Naveen Kr. Sharma
		  and Adriana Szekeres},
  institution = {University of Washington CSE},
  month = aug,
  number = {UW-CSE-TR-16-08-02},
  title = {Providing Stable Storage for the Diskless Crash-Recovery
		  Failure Model},
  year = {2016},
  abstract = {Many classic protocols in the fault tolerant distributed
		  computing literature assume a Crash-Fail model in which
		  processes either are up, or have crashed and are
		  permanently down. While this model is useful, it does not
		  fully capture the difficulties many real systems must
		  contend with. In particular, real-world systems are
		  long-lived and must have a recovery mechanism so that
		  crashed processes can rejoin the system and restore its
		  fault-tolerance. When processes are assumed to have access
		  to stable storage that is persistent across failures,
		  recovery in the Crash-Recovery model is trivial. However,
		  because disk failures are common and because having a disk
		  on a protocol's critical path is often a performance
		  concern, diskless recovery protocols are needed. While such
		  protocols do exist in the state machine replication
		  literature, several well-known protocols have flawed
		  recovery mechanisms. We examine these errors to elucidate
		  the problem of diskless recovery and present our own
		  protocol for providing virtual stable storage, transforming
		  any protocol in the Crash-Recovery with stable storage
		  model into a protocol in the Diskless Crash-Recovery
		  model.},
  pdf = {papers/diskless-tr16.pdf},
  monthnum = {08}
}
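The essence of the diskless-recovery problem can be illustrated with recovery epochs: a node tags its messages with an incarnation number that increases on every recovery, so promises made before a crash (and lost with volatile state) can be recognized and discarded. This toy Python sketch tracks a single counter; the paper's crash vectors generalize the idea to a vector of per-node counters. Hypothetical illustration only.

# Simplified sketch of recovery epochs for diskless crash-recovery.
# Not the paper's protocol; names are invented for this example.

class Node:
    def __init__(self, node_id):
        self.node_id = node_id
        self.incarnation = 1     # in practice re-learned from peers

    def promise(self, ballot):
        # Every promise is tagged with the current incarnation.
        return {"node": self.node_id, "inc": self.incarnation,
                "ballot": ballot}

    def crash_and_recover(self):
        # Diskless recovery: all volatile state is lost; the node
        # rejoins with a higher incarnation so stale promises can be
        # filtered out.
        self.incarnation += 1

def still_valid(promise, current_incarnations):
    # A promise from an earlier incarnation predates a crash: discard.
    return promise["inc"] == current_incarnations[promise["node"]]

n = Node("A")
p = n.promise(ballot=7)
n.crash_and_recover()
print(still_valid(p, {"A": n.incarnation}))  # False: made before crash
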
@article{zhang16:_when_is_operat_order_requir,
  author = {Irene Zhang and Naveen Kr. Sharma and Adriana Szekeres and
		  Arvind Krishnamurthy and Dan R. K. Ports},
  journal = {{IEEE} Data Engineering Bulletin},
  month = mar,
  number = {1},
  pages = {27--38},
  title = {When Is Operation Ordering Required in Replicated
		  Transactional Storage?},
  volume = {39},
  year = {2016},
  abstract = {Today's replicated transactional storage systems typically
		  have a layered architecture, combining protocols for
		  transaction coordination, consistent replication, and
		  concurrency control. These systems generally require costly
		  strongly-consistent replication protocols like Paxos, which
		  assign a total order to all operations. To avoid this cost,
		  we ask whether all replicated operations in these systems
		  need to be strictly ordered. Recent research has yielded
		  replication protocols that can avoid unnecessary ordering,
		  e.g., by exploiting commutative operations, but it is not
		  clear how to apply these to replicated transaction
		  processing systems. We answer this question by analyzing
		  existing transaction processing designs in terms of which
		  replicated operations require ordering and which simply
		  require fault tolerance. We describe how this analysis
		  leads to our recent work on TAPIR, a transaction protocol
		  that efficiently provides strict serializability by using a
		  new replication protocol that provides fault tolerance but
		  not ordering for most operations.},
  pdf = {papers/ordering-debull16.pdf},
  monthnum = {03}
}
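The core distinction the analysis draws is between operations that commute (which need only fault tolerance) and operations that conflict (which need ordering). A toy Python check makes the distinction concrete; this is a hypothetical classifier for illustration, not an analysis from the paper.

# Toy illustration: operations that commute can be replicated without
# a total order, while conflicting ones require it.

def commutes(op1, op2):
    # Writes to different keys, or blind increments to the same key,
    # produce the same state in either order.
    if op1["key"] != op2["key"]:
        return True
    return op1["kind"] == "incr" and op2["kind"] == "incr"

ops = [{"kind": "incr", "key": "x"},
       {"kind": "incr", "key": "x"},
       {"kind": "put",  "key": "x"}]

needs_order = any(not commutes(a, b)
                  for i, a in enumerate(ops) for b in ops[i + 1:])
print("total order required:", needs_order)  # True: put conflicts
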
@article{peter15:_arrak,
  author = {Simon Peter and Jialin Li and Irene Zhang and Dan R. K.
		  Ports and Doug Woos and Arvind Krishnamurthy and Thomas
		  Anderson and Timothy Roscoe},
  journal = {{ACM} Transactions on Computer Systems},
  month = nov,
  number = {4},
  title = {Arrakis: The Operating System Is the Control Plane},
  volume = {33},
  year = {2015},
  abstract = {Recent device hardware trends enable a new approach to the
		  design of network server operating systems. In a
		  traditional operating system, the kernel mediates access to
		  device hardware by server applications to enforce process
		  isolation as well as network and disk security. We have
		  designed and implemented a new operating system, Arrakis,
		  that splits the traditional role of the kernel in two.
		  Applications have direct access to virtualized I/O devices,
		  allowing most I/O operations to skip the kernel entirely,
		  while the kernel is re-engineered to provide network and
		  disk protection without kernel mediation of every
		  operation. We describe the hardware and software changes
		  needed to take advantage of this new abstraction, and we
		  illustrate its power by showing improvements of 2-5x in
		  latency and 9x in throughput for a popular persistent NoSQL
		  store relative to a well-tuned Linux implementation.},
  pdf = {papers/arrakis-tocs15.pdf},
  code = {https://github.com/UWNetworksLab/arrakis},
  monthnum = {11}
}
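The control-plane/data-plane split can be caricatured in a few lines: protection is checked once, when a channel is set up, and the per-operation fast path then proceeds without any mediation. The Python below is a toy analogy only; the real system relies on hardware I/O virtualization (e.g., SR-IOV), and all names here are invented.

# Toy analogy for the Arrakis control-plane/data-plane split.

class ControlPlane:
    def __init__(self, allowed_ports):
        self.allowed_ports = allowed_ports

    def open_channel(self, app, port):
        # One-time policy check, analogous to kernel setup work.
        if port not in self.allowed_ports:
            raise PermissionError(f"{app} may not bind port {port}")
        return DataChannel(port)

class DataChannel:
    def __init__(self, port):
        self.port = port
        self.wire = []

    def send(self, payload):
        # Fast path: no policy check, no kernel crossing per operation.
        self.wire.append((self.port, payload))

cp = ControlPlane(allowed_ports={80, 443})
ch = cp.open_channel("webserver", 80)   # mediated once
ch.send(b"GET / HTTP/1.1")              # unmediated thereafter
print(ch.wire)
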
@techreport{zhang15:_build_consis_trans_incon_replic_exten_version,
  author = {Irene Zhang and Naveen Kr. Sharma and Adriana Szekeres and
		  Arvind Krishnamurthy and Dan R. K. Ports},
  institution = {University of Washington CSE},
  month = oct,
  number = {UW-CSE-2014-12-01 v2},
  title = {Building Consistent Transactions with Inconsistent
		  Replication (Extended Version)},
  year = {2015},
  abstract = {Application programmers increasingly prefer distributed
		  storage systems with strong consistency and distributed
		  transactions (e.g., Google's Spanner) for their strong
		  guarantees and ease of use. Unfortunately, existing
		  transactional storage systems are expensive to use -- in
		  part because they require costly replication protocols,
		  like Paxos, for fault tolerance. In this paper, we present
		  a new approach that makes transactional storage systems
		  more affordable: we eliminate consistency from the
		  replication protocol while still providing distributed
		  transactions with strong consistency to applications. 

We present TAPIR -- the Transactional Application Protocol for Inconsistent Replication -- the first transaction protocol to use a novel replication protocol, called inconsistent replication, that provides fault tolerance without consistency. By enforcing strong consistency only in the transaction protocol, TAPIR can commit transactions in a single round-trip and order distributed transactions without centralized coordination. We demonstrate the use of TAPIR in a transactional key-value store, TAPIR-KV. Compared to conventional systems, TAPIR-KV provides better latency and throughput.},
  pdf = {papers/tapir-tr-v2.pdf},
  code = {https://github.com/uwsyslab/tapir/},
  monthnum = {10}
}
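
The shape of an unordered-replication fast path can be sketched briefly: the client sends an operation to every replica and succeeds if a large-enough quorum returns matching results, falling back to a slow path on a mismatch. The Python below is a hypothetical toy, not the TAPIR/IR protocol (whose fast quorum is ceil(3f/2)+1 of 2f+1 replicas, approximated here).

# Toy sketch of an inconsistent-replication-style fast path.
from collections import Counter

def invoke(replicas, op):
    results = [r(op) for r in replicas]           # one round trip
    value, count = Counter(results).most_common(1)[0]
    fast_quorum = (3 * len(replicas)) // 4 + 1    # rough supermajority
    if count >= fast_quorum:
        return value                              # committed in one round
    raise RuntimeError("no fast quorum; a slow path would reconcile")

replicas = [lambda op: "PREPARE-OK"] * 5
print(invoke(replicas, {"txn": 42}))              # 'PREPARE-OK'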

@techreport{li14:_tales_tail_tr,
  address = {Seattle, WA, USA},
  author = {Jialin Li and Naveen Kr. Sharma and Dan R. K. Ports and
		  Steven D. Gribble},
  institution = {University of Washington CSE},
  month = apr,
  number = {UW-CSE-14-04-01},
  title = {Tales of the Tail: Hardware, {OS}, and Application-level
		  Sources of Tail Latency},
  year = {2014},
  abstract = {Interactive services often have large-scale parallel
		  implementations. To deliver fast responses, the median and
		  tail latencies of a service's components must be low. In
		  this paper, we explore the hardware, OS, and
		  application-level sources of poor tail latency in high
		  throughput servers executing on multi-core machines. 

We first review the basic queuing theory that governs service latency. Using fine-grained measurements of three different servers (a null RPC service, Memcached, and Nginx) on Linux, we then explore why these servers exhibit significantly worse tail latencies than queuing models alone predict. The underlying causes include interference from background processes, request re-ordering caused by poor scheduling or constrained concurrency models, suboptimal interrupt routing, CPU power saving mechanisms, and NUMA effects.

We systematically eliminate these factors and show that Memcached can achieve a median latency of 11 us and a 99.9th percentile latency of 32 us at 75\% utilization. In comparison, a naive deployment of Memcached has a median latency of 33 us and a 99.9th percentile latency of 14 ms. Finally, we demonstrate that a tradeoff often exists between throughput and tail latency.},
  pdf = {papers/latency-tr14.pdf},
  supersededby = {li14:_tales_tail},
  supersededas = {Tech. Report},
  monthnum = {04}
}
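
The queuing-theory baseline the measurements are compared against can be made concrete. For an M/M/1 queue, response time is exponentially distributed with rate mu - lambda, so the mean and any percentile follow directly. The Python below uses these textbook formulas (it is not the paper's measurement tooling, and the example numbers are illustrative only).

# Textbook M/M/1 baseline: at utilization rho = lam/mu, response time
# is Exp(mu - lam), so mean = 1/(mu - lam) and the p-th percentile is
# -ln(1 - p)/(mu - lam).
import math

def mm1_latency(mu, rho, p=0.999):
    lam = rho * mu
    mean = 1.0 / (mu - lam)
    pct = -math.log(1.0 - p) / (mu - lam)
    return mean, pct

# A server with 20 us average service time (mu = 50,000 req/s) at 75%
# utilization: mean ~80 us, 99.9th percentile ~553 us.
mean, p999 = mm1_latency(mu=50_000, rho=0.75)
print(f"mean = {mean * 1e6:.0f} us, 99.9th = {p999 * 1e6:.0f} us")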

@techreport{peter14:_arrak_tr_v2,
  address = {Seattle, WA, USA},
  author = {Simon Peter and Jialin Li and Irene Zhang and Dan R. K.
		  Ports and Arvind Krishnamurthy and Thomas Anderson and
		  Timothy Roscoe},
  institution = {University of Washington CSE},
  month = may,
  number = {UW-CSE-13-10-01, version 2.0},
  title = {Arrakis: The Operating System is the Control Plane},
  year = {2014},
  abstract = {Recent device hardware trends enable a new approach to the
		  design of network server operating systems. In a
		  traditional operating system, the kernel mediates access to
		  device hardware by server applications, to enforce process
		  isolation as well as network and disk security. We have
		  designed and implemented a new operating system, Arrakis,
		  that splits the traditional role of the kernel in two.
		  Applications have direct access to virtualized I/O devices,
		  allowing most I/O operations to skip the kernel entirely,
		  while the kernel is re-engineered to provide network and
		  disk protection without kernel mediation of every
		  operation. We describe the hardware and software changes
		  needed to take advantage of this new abstraction, and we
		  illustrate its power by showing 2-5x end-to-end latency and
		  9x throughput improvements for a popular persistent NoSQL
		  store relative to a well-tuned Linux implementation.},
  pdf = {papers/arrakis-tr-ver2.pdf},
  supersededby = {peter14:_arrak},
  supersededas = {Tech. Report (v2.0, 2014)},
  monthnum = {05}
}
@techreport{zhang14:_build_consis_trans_incon_replic,
  author = {Irene Zhang and Naveen Kr. Sharma and Adriana Szekeres and
		  Arvind Krishnamurthy and Dan R. K. Ports},
  institution = {University of Washington CSE},
  month = dec,
  number = {UW-CSE-2014-12-01},
  title = {Building Consistent Transactions with Inconsistent
		  Replication},
  year = {2014},
  pdf = {papers/tapir-tr14.pdf},
  supersededby = {zhang15:_build_consis_trans_incon_replic_exten_version},
  supersededas = {Tech. Report (2014)},
  monthnum = {12}
}
@techreport{hornyack13:_study_virtual_memor_usage_implic_large_memor,
  address = {Seattle, WA},
  author = {Peter Hornyack and Luis Ceze and Steven D. Gribble and Dan
		  R. K. Ports and Henry M. Levy},
  institution = {University of Washington CSE},
  title = {A Study of Virtual Memory Usage and Implications for Large
		  Memory},
  year = {2013},
  abstract = {The mechanisms now used to implement virtual memory --
		  pages, page tables, and TLBs -- have worked remarkably well
		  for over fifty years. However, they are beginning to show
		  their age due to current trends, such as significant
		  increases in physical memory size, emerging data-intensive
		  applications, and imminent non-volatile main memory. These
		  trends call into question whether page-based
		  address-translation and protection mechanisms remain viable
		  solutions in the future. In this paper, we present a
		  detailed study of how modern applications use virtual
		  memory. Among other topics, our study examines the
		  footprint of mapped regions, the use of memory protection,
		  and the overhead of TLBs. Our results suggest that a
		  segment-based translation mechanism, together with a
		  fine-grained protection mechanism, merit consideration for
		  future systems.},
  pdf = {papers/vmstudy-tr13.pdf},
  monthnum = {}
}
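The segment-based alternative the study points toward can be contrasted with paging in a toy example: one (base, limit) pair covers an arbitrarily large contiguous region, where paging needs one entry per 4 KB page (and, in hardware, a TLB entry). The Python below is a hypothetical sketch; structures and numbers are illustrative only.

# Toy contrast of page-based vs segment-based address translation.

PAGE = 4096

def page_translate(page_table, vaddr):
    # One entry per 4 KB page: large footprints mean many entries.
    frame = page_table[vaddr // PAGE]
    return frame * PAGE + vaddr % PAGE

def segment_translate(segments, vaddr):
    # One (base, limit, phys) triple covers a whole region.
    for base, limit, phys in segments:
        if base <= vaddr < base + limit:
            return phys + (vaddr - base)
    raise MemoryError("segmentation fault")

page_table = {i: 100 + i for i in range(16)}      # 16 pages -> 16 entries
segments = [(0, 16 * PAGE, 100 * PAGE)]           # same region, 1 entry
v = 5 * PAGE + 123
assert page_translate(page_table, v) == segment_translate(segments, v)
print("both map vaddr", v, "to", segment_translate(segments, v))
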
@techreport{peter13:_arrak,
  address = {Seattle, WA, USA},
  author = {Simon Peter and Jialin Li and Irene Zhang and Dan R. K.
		  Ports and Arvind Krishnamurthy and Thomas Anderson and
		  Timothy Roscoe},
  institution = {University of Washington CSE},
  month = oct,
  number = {UW-CSE-13-10-01},
  title = {Arrakis: The Operating System is the Control Plane},
  year = {2013},
  abstract = {Recent device hardware trends enable a new approach to the
		  design of network servers. In a traditional operating
		  system, the kernel mediates access to device hardware by
		  server applications, to enforce process isolation as well
		  as network and disk security. We have designed and
		  implemented a new operating system, Arrakis, that splits
		  the traditional role of the kernel in two. Applications
		  have direct access to virtualized I/O devices, allowing
		  most I/O operations to skip the kernel entirely. The
		  Arrakis kernel operates only in the control plane. We
		  describe the hardware and software changes needed to
		  take advantage of this new abstraction, and we illustrate
		  its power by showing significant latency and throughput
		  improvements for network server applications relative to a
		  well-tuned Linux implementation.},
  pdf = {papers/arrakis-tr.pdf},
  supersededby = {peter14:_arrak},
  supersededas = {Tech. Report (v1.0, 2013)},
  monthnum = {10}
}
@phdthesis{ports12:_applic_level_cachin_trans_consis,
  address = {Cambridge, MA, USA},
  author = {Dan R. K. Ports},
  month = jun,
  school = {Massachusetts Institute of Technology},
  type = {Ph.D. thesis},
  title = {Application-Level Caching with Transactional Consistency},
  year = {2012},
  abstract = {Distributed in-memory application data caches like
		  memcached are a popular solution for scaling
		  database-driven web sites. These systems increase
		  performance significantly by reducing load on both the
		  database and application servers. Unfortunately, such
		  caches present two challenges for application developers.
		  First, they cannot ensure that the application sees a
		  consistent view of the data within a transaction, violating
		  the isolation properties of the underlying database.
		  Second, they leave the application responsible for locating
		  data in the cache and keeping it up to date, a frequent
		  source of application complexity and programming errors.

This thesis addresses both of these problems in a new cache called TxCache. TxCache is a transactional cache: it ensures that any data seen within a transaction, whether from the cache or the database, reflects a slightly stale but consistent snapshot of the database. TxCache also offers a simple programming model. Application developers simply designate certain functions as cacheable, and the system automatically caches their results and invalidates the cached data as the underlying database changes.

Our experiments found that TxCache can substantially increase the performance of a web application: on the RUBiS benchmark, it increases throughput by up to 5.2x relative to a system without caching. More importantly, on this application, TxCache achieves performance comparable (within 5\%) to that of a non-transactional cache, showing that consistency does not have to come at the price of performance.},
  pdf = {papers/thesis.pdf},
  monthnum = {06}
}
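
The cacheable-function programming model can be sketched compactly: the application wraps a function, the cache memoizes its results keyed by arguments, and each result is tagged with the data it depends on so writes can invalidate it. The Python below is a hypothetical illustration only; TxCache additionally pins all reads in a transaction to a consistent database snapshot, which this toy omits.

# Hypothetical sketch of the cacheable-function model with tag-based
# invalidation (not the TxCache implementation).

class Cache:
    def __init__(self):
        self.entries = {}   # (fname, args) -> value
        self.tags = {}      # tag -> set of cache keys to invalidate

    def cacheable(self, fn, tag_of):
        def wrapped(*args):
            key = (fn.__name__, args)
            if key in self.entries:
                return self.entries[key]         # cache hit
            value = fn(*args)
            self.entries[key] = value
            self.tags.setdefault(tag_of(*args), set()).add(key)
            return value
        return wrapped

    def invalidate(self, tag):
        # Called when the underlying data changes.
        for key in self.tags.pop(tag, set()):
            self.entries.pop(key, None)

db = {"alice": 3}
cache = Cache()

def bid_count(user):
    return db[user]                              # stands in for a DB query

bid_count = cache.cacheable(bid_count, tag_of=lambda u: ("bids", u))
print(bid_count("alice"))        # miss: 3
db["alice"] = 4
print(bid_count("alice"))        # hit: still 3 until invalidated
cache.invalidate(("bids", "alice"))
print(bid_count("alice"))        # refreshed: 4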

@article{ports12:_serial_snaps_isolat_postg,
  author = {Dan R. K. Ports and Kevin Grittner},
  journal = {Proceedings of the VLDB Endowment},
  month = aug,
  number = {12},
  pages = {1850--1861},
  title = {Serializable Snapshot Isolation in {PostgreSQL}},
  volume = {5},
  year = {2012},
  abstract = {This paper describes our experience implementing
		  PostgreSQL's new serializable isolation level. It is based
		  on the recently developed Serializable Snapshot Isolation
		  (SSI) technique. This is the first implementation of SSI in
		  a production database release as well as the first in a
		  database that did not previously have a lock-based
		  serializable isolation level. We reflect on our experience
		  and describe how we overcame some of the resulting
		  challenges, including the implementation of a new lock
		  manager, a technique for ensuring memory usage is bounded,
		  and integration with other PostgreSQL features. We also
		  introduce an extension to SSI that improves performance for
		  read-only transactions. We evaluate PostgreSQL's
		  serializable isolation level using several benchmarks and
		  show that it achieves performance only slightly below that
		  of snapshot isolation, and significantly outperforms the
		  traditional two-phase locking approach on read-intensive
		  workloads.},
  pdf = {papers/ssi-vldb12.pdf},
  slidespdf = {papers/ssi-vldb12-slides.pdf},
  code = {https://www.postgresql.org/},
  monthnum = {08}
}
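
The rule at the heart of SSI can be sketched: track rw-antidependencies between concurrent transactions and abort when some transaction acquires both an incoming and an outgoing edge (the "dangerous structure"). The Python below is a toy illustration of that rule only; PostgreSQL's implementation uses predicate (SIREAD) locks, a purpose-built lock manager, and refinements such as the read-only optimization mentioned above.

# Toy sketch of SSI's dangerous-structure check: a transaction with
# both an incoming and an outgoing rw-antidependency (a "pivot") may
# be part of a serialization anomaly, so it is conservatively aborted.

class Txn:
    def __init__(self, name):
        self.name = name
        self.in_rw = False    # some concurrent txn has an rw edge into us
        self.out_rw = False   # we have an rw edge out to a concurrent txn

def add_rw_edge(reader, writer):
    # reader read a version that writer overwrote (rw-antidependency).
    reader.out_rw = True
    writer.in_rw = True
    for t in (reader, writer):
        if t.in_rw and t.out_rw:
            return f"abort {t.name}: pivot in dangerous structure"
    return None

t1, t2, t3 = Txn("T1"), Txn("T2"), Txn("T3")
print(add_rw_edge(t1, t2))   # None: a single edge is harmless
print(add_rw_edge(t2, t3))   # abort T2: it has both in and out edges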

This file was generated by bibtex2html 1.99.