diff --git a/Makefile b/Makefile index 0ea52ee..523a475 100644 --- a/Makefile +++ b/Makefile @@ -10,6 +10,7 @@ _includes/pubs.html: bib/pubs.bib bib/publications.tmpl $(BIBBLE) $+ > $@ build: _includes/pubs.html + ./dblpbibcloud.py jekyll build # you can configure these at the shell, e.g.: diff --git a/bib/misc.bib b/bib/misc.bib new file mode 100644 index 0000000..930dbfd --- /dev/null +++ b/bib/misc.bib @@ -0,0 +1,125 @@ +%% +%% This file is editable +%% order is unimportant + + +@article{Sirin:284817, + title = {Micro-architectural Analysis of In-memory OLTP: Revisited}, + author = {Sirin, Utku and Tözün, Pınar and Porobic, Danica and Yasin, Ahmad and Ailamaki, Anastasia}, + journal = {The VLDB Journal}, + number = {4}, + volume = {30}, + venue_short = {VLDBJ '21}, + year = {2021}, + abstract = {Micro-architectural behavior of traditional disk-based online transaction processing (OLTP) systems has been investigated extensively over the past couple of decades. Results show that traditional OLTP systems mostly under-utilize the available micro-architectural resources. In-memory OLTP systems, on the other hand, process all the data in main-memory and, therefore, can omit the buffer pool. Furthermore, they usually adopt more lightweight concurrency control mechanisms, cache-conscious data structures, and cleaner codebases since they are usually designed from scratch. Hence, we expect significant differences in micro-architectural behavior when running OLTP on platforms optimized for in-memory processing as opposed to disk-based database systems. In particular, we expect that in-memory systems exploit micro-architectural features such as instruction and data caches significantly better than disk-based systems. This paper sheds light on the micro-architectural behavior of in-memory database systems by analyzing and contrasting it to the behavior of disk-based systems when running OLTP workloads. The results show that, despite all the design changes, in-memory OLTP exhibits very similar micro-architectural behavior to disk-based OLTP: more than half of the execution time goes to memory stalls where instruction cache misses or the long-latency data misses from the last-level cache (LLC) are the dominant factors in the overall execution time. Even though ground-up designed in-memory systems can eliminate the instruction cache misses, the reduction in instruction stalls amplifies the impact of LLC data misses. As a result, only 30% of the CPU cycles are used to retire instructions, and 70% of the CPU cycles are wasted to stalls for both traditional disk-based and new generation in-memory OLTP.}, + url = {http://infoscience.epfl.ch/record/284817}, + doi = {10.1007/s00778-021-00663-8}, +} + +@article{Dauterman:287869, + title = {SafetyPin: Encrypted Backups with Human-Memorable Secrets}, + author = {Dauterman, Emma and Corrigan-Gibbs, Henry and Mazieres, David}, + publisher = {USENIX ASSOC}, + journal = {Proceedings Of The 14th Usenix Symposium On Operating Systems Design And Implementation (OSDI'20)}, + address = {Berkeley}, + pages = {1121-1138}, + venue_short = {OSDI '20}, + year = {2020}, + abstract = {We present the design and implementation of SafetyPin, a system for encrypted mobile-device backups. Like existing cloud-based mobile-backup systems, including those of Apple and Google, SafetyPin requires users to remember only a short PIN and defends against brute-force PIN-guessing attacks using hardware security protections. 
Unlike today's systems, SafetyPin splits trust over a cluster of hardware security modules (HSMs) in order to provide security guarantees that scale with the number of HSMs. In this way, SafetyPin protects backed-up user data even against an attacker that can adaptively compromise many of the system's constituent HSMs. SafetyPin provides this protection without sacrificing scalability or fault tolerance. Decentralizing trust while respecting the resource limits of today's HSMs requires a synthesis of systems-design principles and cryptographic tools. We evaluate SafetyPin on a cluster of 100 low-cost HSMs and show that a SafetyPin-protected recovery takes 1.01 seconds. To process 1B recoveries a year, we estimate that a SafetyPin deployment would need 3,100 low-cost HSMs.}, + url = {http://infoscience.epfl.ch/record/287869}, +} + + +@article{Cohn-Gordon:287859, + title = {DELF: Safeguarding Deletion Correctness in Online Social Networks}, + author = {Cohn-Gordon, Katriel and Damaskinos, Georgios and Neto, Divino and Cordova, Shi and Reitz, Benoit and Strahs, Benjamin and Obenshain, Daniel and Pearce, Paul and Papagiannis, Ioannis}, + publisher = {USENIX ASSOC}, + journal = {Proceedings Of The 29th Usenix Security Symposium}, + address = {Berkeley}, + pages = {1057-1074}, + venue_short = {USENIX Security '20}, + year = {2020}, + abstract = {Deletion is a core facet of Online Social Networks (OSNs). For users, deletion is a tool to remove what they have shared and control their data. For OSNs, robust deletion is both an obligation to their users and a risk when developer mistakes inevitably occur. While developers are effective at identifying high-level deletion requirements in products (e.g., users should be able to delete posted photos), they are less effective at mapping high-level requirements into concrete operations (e.g., deleting all relevant items in data stores). Without framework support, developer mistakes lead to violations of users' privacy, such as retaining data that should be deleted, deleting the wrong data, and exploitable vulnerabilities.}, + url = {http://infoscience.epfl.ch/record/287859}, +} + + +@article{Anadiotis:282822, + title = {A System Design for Elastically Scaling Transaction Processing Engines in Virtualized Servers}, + author = {Anadiotis, Angelos-Christos and Appuswamy, Raja and Ailamaki, Anastasia and Bronshtein, Ilan and Avni, Hillel and Dominguez-Sal, David and Goikhman, Shay and Levy, Eliezer}, + publisher = {ASSOC COMPUTING MACHINERY}, + journal = {Proceedings of the VLDB Endowment}, + address = {New York}, + number = {12}, + volume = {13}, + pages = {3085-3098}, + venue_short = {VLDB '20}, + year = {2020}, + abstract = {Online Transaction Processing (OLTP) deployments are migrating from on-premise to cloud settings in order to exploit the elasticity of cloud infrastructure which allows them to adapt to workload variations. However, cloud adaptation comes at the cost of redesigning the engine, which has led to the introduction of several, new, cloud-based transaction processing systems mainly focusing on: (i) the transaction coordination protocol, (ii) the data partitioning strategy, and, (iii) the resource isolation across multiple tenants. 
As a result, standalone OLTP engines cannot be easily deployed in an elastic setting in the cloud and they need to migrate to another, specialized deployment.}, + url = {http://infoscience.epfl.ch/record/282822}, + doi = {10.14778/3415478.3415536}, +} +@article{Sirin:280879, + title = {Micro-architectural Analysis of OLAP: Limitations and Opportunities}, + author = {Sirin, Utku and Ailamaki, Anastasia}, + publisher = {ASSOC COMPUTING MACHINERY}, + journal = {Proceedings of the VLDB Endowment}, + address = {New York}, + number = {6}, + volume = {13}, + pages = {840-853}, + venue_short = {VLDB '20}, + year = {2020}, + abstract = {Understanding micro-architectural behavior is important for efficiently using hardware resources. Recent work has shown that in-memory online transaction processing (OLTP) systems severely underutilize their core micro-architecture resources [29]. Whereas, online analytical processing (OLAP) workloads exhibit a completely different computing pattern. OLAP workloads are read-only, bandwidth-intensive, and include various data access patterns. With the rise of column-stores, they run on high-performance engines that are tightly optimized for modern hardware. Consequently, micro-architectural behavior of modern OLAP systems remains unclear.}, + url = {http://infoscience.epfl.ch/record/280879}, + doi = {10.14778/3380750.3380755}, +} +@article{Olma:275745, + title = {Adaptive Partitioning and Indexing for In-situ Query Processing}, + author = {Olma, Matthaios and Karpathiotakis, Manos and Alagiannis, Ioannis and Athanassoulis, Manos and Ailamaki, Anastasia}, + publisher = {SPRINGER}, + journal = {The VLDB Journal}, + address = {New York}, + number = {1}, + volume = {29}, + pages = {569-591}, + venue_short = {VLDBJ '20}, + year = {2020}, + abstract = {The constant flux of data and queries alike has been pushing the boundaries of data analysis systems. The increasing size of raw data files has made data loading an expensive operation that delays the data-to-insight time. To alleviate the loading cost, in situ query processing systems operate directly over raw data and offer instant access to data. At the same time, analytical workloads have an increasing number of queries. Typically, each query focuses on a constantly shifting, yet small, range. As a result, minimizing the workload latency requires the benefits of indexing in in situ query processing. In this paper, we present an online partitioning and indexing scheme, along with a partitioning and indexing tuner tailored for in situ querying engines. The proposed system design improves query execution time by taking into account user query patterns, to (i) partition raw data files logically and (ii) build lightweight partition-specific indexes for each partition. We build an in situ query engine called Slalom to showcase the impact of our design. Slalom employs adaptive partitioning and builds non-obtrusive indexes in different partitions on-the-fly based on lightweight query access pattern monitoring. As a result of its lightweight nature, Slalom achieves efficient query processing over raw data with minimal memory consumption. 
Our experimentation with both microbenchmarks and real-life workloads shows that Slalom outperforms state-of-the-art in situ engines and achieves comparable query response times with fully indexed DBMS, offering lower cumulative query execution times for query workloads with increasing size and unpredictable access patterns.}, + url = {http://infoscience.epfl.ch/record/275745}, + doi = {10.1007/s00778-019-00580-x}, +} + + +@article{Oh:276919, + title = {Linebacker: Preserving Victim Cache Lines in Idle Register Files of GPUs}, + author = {Oh, Yunho and Koo, Gunjae and Annavaram, Murali and Ro, Won Woo}, + publisher = {ASSOC COMPUTING MACHINERY}, + journal = {Proceedings Of The 2019 46Th International Symposium On Computer Architecture (ISCA'19)}, + address = {New York}, + pages = {183-196}, + venue_short = {ISCA '19}, + year = {2019}, + abstract = {Modern GPUs suffer from cache contention due to the limited cache size that is shared across tens of concurrently running warps. To increase the per-warp cache size prior techniques proposed warp throttling which limits the number of active warps. Warp throttling leaves several registers to be dynamically unused whenever a warp is throttled. Given the stringent cache size limitation in GPUs this work proposes a new cache management technique named Linebacker (LB) that improves GPU performance by utilizing idle register file space as victim cache space. Whenever a CTA becomes inactive, linebacker backs up the registers of the throttled CTA to the off-chip memory. Then, linebacker utilizes the corresponding register file space as victim cache space. If any load instruction finds data in the victim cache line, the data is directly copied to the destination register through a simple register-register move operation. To further improve the efficiency of victim cache linebacker allocates victim cache space only to a select few load instructions that exhibit high data locality. Through a careful design of victim cache indexing and management scheme linebacker provides 29.0% of speedup compared to the previously proposed warp throttling techniques.}, + url = {http://infoscience.epfl.ch/record/276919}, + doi = {10.1145/3307650.3322222}, +} + + + +@article{Shamis:273429, + title = {Fast General Distributed Transactions with Opacity}, + author = {Shamis, Alex and Renzelmann, Matthew and Novakovic, Stanko and Chatzopoulos, Georgios and Dragojevic, Aleksandar and Narayanan, Dushyanth and Castro, Miguel}, + publisher = {ASSOC COMPUTING MACHINERY}, + journal = {Proceedings Of The 2019 International Conference On Management Of Data (SIGMOD'19)}, + address = {New York}, + series = {International Conference on Management of Data}, + pages = {433-448}, + venue_short = {SIGMOD '19}, + year = {2019}, + abstract = {Transactions can simplify distributed applications by hiding data distribution, concurrency, and failures from the application developer. Ideally the developer would see the abstraction of a single large machine that runs transactions sequentially and never fails. This requires the transactional subsystem to provide opacity (strict serializability for both committed and aborted transactions), as well as transparent fault tolerance with high availability. As even the best abstractions are unlikely to be used if they perform poorly, the system must also provide high performance. Existing distributed transactional designs either weaken this abstraction or are not designed for the best performance within a data center. 
This paper extends the design of FaRM - which provides strict serializability only for committed transactions - to provide opacity while maintaining FaRM's high throughput, low latency, and high availability within a modern data center. It uses timestamp ordering based on real time with clocks synchronized to within tens of microseconds across a cluster, and a failover protocol to ensure correctness across clock master failures. FaRM with opacity can commit 5.4 million neworder transactions per second when running the TPC-C transaction mix on 90 machines with 3-way replication.}, + url = {http://infoscience.epfl.ch/record/273429}, + doi = {10.1145/3299869.3300069}, +} + + diff --git a/bib/pubs.bib b/bib/pubs.bib index bfed4d2..7a99105 100644 --- a/bib/pubs.bib +++ b/bib/pubs.bib @@ -1,366 +1,282 @@ - - +%% DO NOT EDIT +%% Generated by dblpbibcloud.py @article{Ghosn:283299, - title = {Enclosure: Language-Based Restriction of Untrusted Libraries}, - author = {Ghosn, Adrien and Kogias, Marios and Payer, Mathias and Larus, James and Bugnion, Edouard}, - pages = {13}, - year = {2021}, - journal = {Proceedings of the 26th ACM International Conference on Architectural Support for Programming Languages and Operating Systems}, - abstract = {Programming languages and systems have failed to address the security implications of the increasingly frequent use of public libraries to construct modern software. Most languages provide tools and online repositories to publish, import, and use libraries; however, this double-edged sword can incorporate a large quantity of unknown, unchecked, and unverified code into an application. The risk is real, as demonstrated by malevolent actors who have repeatedly inserted malware into popular open-source libraries. This paper proposes a solution: enclosures, a new programming language construct for library isolation that provides a developer with fine-grain control over the resources that a library can access, even for libraries with complex inter-library dependencies. The programming abstraction is language-independent and could be added to most languages. These languages would then be able to take advantage of hardware isolation mechanisms that are effective across language boundaries. The enclosure policies are enforced at run time by LitterBox, a language-independent framework that uses hardware mechanisms to provide uniform and robust isolation guarantees, even for libraries written in unsafe languages. LitterBox currently supports both Intel VT-x (with general-purpose extended page tables) and the emerging Intel Memory Protection Keys (MPK). We describe an enclosure implementation for the Go and Python languages. Our evaluation demonstrates that the Go implementation can protect sensitive data in real-world applications constructed using complex untrusted libraries with deep dependencies. It requires minimal code refactoring and incurs acceptable performance overhead. The Python implementation demonstrates LitterBox’s ability to support dynamic languages.}, - url = {http://infoscience.epfl.ch/record/283299}, - venue_short = {ASPLOS '21}, - -} + title={Enclosure: Language-Based Restriction of Untrusted Libraries}, + author={Ghosn, Adrien and Kogias, Marios and Payer, Mathias and Larus, James and Bugnion, Edouard}, + abstract={Programming languages and systems have failed to address the security implications of the increasingly frequent use of public libraries to construct modern software. 
Most languages provide tools and online repositories to publish, import, and use libraries; however, this double-edged sword can incorporate a large quantity of unknown, unchecked, and unverified code into an application. The risk is real, as demonstrated by malevolent actors who have repeatedly inserted malware into popular open-source libraries. This paper proposes a solution: enclosures, a new programming language construct for library isolation that provides a developer with fine-grain control over the resources that a library can access, even for libraries with complex inter-library dependencies. The programming abstraction is language-independent and could be added to most languages. These languages would then be able to take advantage of hardware isolation mechanisms that are effective across language boundaries. The enclosure policies are enforced at run time by LitterBox, a language-independent framework that uses hardware mechanisms to provide uniform and robust isolation guarantees, even for libraries written in unsafe languages. LitterBox currently supports both Intel VT-x (with general-purpose extended page tables) and the emerging Intel Memory Protection Keys (MPK). We describe an enclosure implementation for the Go and Python languages. Our evaluation demonstrates that the Go implementation can protect sensitive data in real-world applications constructed using complex untrusted libraries with deep dependencies. It requires minimal code refactoring and incurs acceptable performance overhead. The Python implementation demonstrates LitterBox’s ability to support dynamic languages.}, + url={http://infoscience.epfl.ch/record/283299}, + venue_short={ASPLOS '21}} @article{Ustiugov:283308, - title = {Benchmarking, Analysis, and Optimization of Serverless Function Snapshots}, - author = {Ustiugov, Dmitrii and Petrov, Plamen and Kogias, Marios and Bugnion, Edouard and Grot, Boris}, - pages = {14}, - year = {2021}, - journal = {Proceedings of the 26th ACM International Conference on Architectural Support for Programming Languages and Operating Systems}, - - abstract = {Serverless computing has seen rapid adoption due to its high scalability and flexible, pay-as-you-go billing model. In serverless, developers structure their services as a collection of functions, sporadically invoked by various events like clicks. High inter-arrival time variability of function invocations motivates the providers to start new function instances upon each invocation, leading to significant cold-start delays that degrade user experience. To reduce cold-start latency, the industry has turned to snapshotting, whereby an image of a fully-booted function is stored on disk, enabling a faster invocation compared to booting a function from scratch. This work introduces vHive, an open-source framework for serverless experimentation with the goal of enabling researchers to study and innovate across the entire serverless stack. Using vHive, we characterize a state-of-the-art snapshot-based serverless infrastructure, based on industry-leading Containerd orchestration framework and Firecracker hypervisor technologies. We find that the execution time of a function started from a snapshot is 95% higher, on average, than when the same function is memory- resident. We show that the high latency is attributable to frequent page faults as the function’s state is brought from disk into guest memory one page at a time. 
Our analysis further reveals that funccloud computing, datacenters, serverless, virtualization, snapshotstions access the same stable working set of pages across different invocations of the same function. By leveraging this insight, we build REAP, a light-weight software mechanism for serverless hosts that records functions’ stable working set of guest memory pages and proactively prefetches it from disk into memory. Compared to baseline snapshotting, REAP slashes the cold-start delays by 3.7×, on average.}, - url = {http://infoscience.epfl.ch/record/283308}, - venue_short = {ASPLOS '21}, - -} + title={Benchmarking, Analysis, and Optimization of Serverless Function Snapshots}, + author={Ustiugov, Dmitrii and Petrov, Plamen and Kogias, Marios and Bugnion, Edouard and Grot, Boris}, + abstract={Serverless computing has seen rapid adoption due to its high scalability and flexible, pay-as-you-go billing model. In serverless, developers structure their services as a collection of functions, sporadically invoked by various events like clicks. High inter-arrival time variability of function invocations motivates the providers to start new function instances upon each invocation, leading to significant cold-start delays that degrade user experience. To reduce cold-start latency, the industry has turned to snapshotting, whereby an image of a fully-booted function is stored on disk, enabling a faster invocation compared to booting a function from scratch. This work introduces vHive, an open-source framework for serverless experimentation with the goal of enabling researchers to study and innovate across the entire serverless stack. Using vHive, we characterize a state-of-the-art snapshot-based serverless infrastructure, based on industry-leading Containerd orchestration framework and Firecracker hypervisor technologies. We find that the execution time of a function started from a snapshot is 95% higher, on average, than when the same function is memory-resident. We show that the high latency is attributable to frequent page faults as the function’s state is brought from disk into guest memory one page at a time. Our analysis further reveals that functions access the same stable working set of pages across different invocations of the same function. By leveraging this insight, we build REAP, a light-weight software mechanism for serverless hosts that records functions’ stable working set of guest memory pages and proactively prefetches it from disk into memory. Compared to baseline snapshotting, REAP slashes the cold-start delays by 3.7×, on average.}, + url={http://infoscience.epfl.ch/record/283308}, + venue_short={ASPLOS '21}} -@article{Primorac:283315, - title = {When to Hedge in Interactive Services}, - author = {Primorac, Mia and Argyraki, Katerina and Bugnion, Edouard}, - publisher = {USENIX ASSOC}, - journal = {Proceedings Of The 18Th Usenix Symposium On Networked System Design And Implementation}, - address = {Berkeley}, - pages = {15. 373-388}, - year = {2021}, - abstract = {In online data-intensive (OLDI) services, each client request typically executes on multiple servers in parallel; as a result, “system hiccups”, although rare within a single server, can interfere with many client requests and cause violations of service-level objectives. Service providers have long been fighting this “tail at scale” problem through “hedging”, i.e., issuing redundant queries to mask system hiccups. 
This, however, can potentially cause congestion that is more detrimental to tail latency than the hiccups themselves. This paper asks: when does it make sense to hedge in OLDI services, and how can we hedge enough to mask system hiccups but not as much as to cause congestion? First, we show that there are many realistic scenarios where hedging can have no benefit—where any hedging-based scheduling policy, including the state-of-the-art, yields no latency reduction compared to optimal load balancing without hedging. Second, we propose LÆDGE, a scheduling policy that combines optimal load balancing with work-conserving hedging, and evaluate it in an AWS cloud deployment. We show that LÆDGE strikes the right balance: first, unlike the state of the art, it never causes unnecessary congestion; second, it performs close to an ideal scheduling policy, improving the 99th percentile latency by as much as 49%, measured on 60% system utilization—without any difficult parameter training as found in the state of the art.}, - url = {http://infoscience.epfl.ch/record/283315}, - venue_short = {NSDI '21}, - -} @article{Gupta:285466, - title = {Rebooting Virtual Memory with Midgard}, - author = {Gupta, Siddharth and Bhattacharyya, Atri and Oh, Yunho and Bhattacharjee, Abhishek and Falsafi, Babak and Payer, Mathias}, - journal = {Proceedings of the 48th International Symposium on Computer Architecture (ISCA'21)}, - pages = {14}, - venue_short = {ISCA '21}, - year = {2021}, - abstract = {Computer systems designers are building cache hierarchies with higher capacity to capture the ever-increasing working sets of modern workloads. Cache hierarchies with higher capacity improve system performance but shift the performance bottleneck to address translation. We propose Midgard, an intermediate address space between the virtual and the physical address spaces, to mitigate address translation overheads without program-level changes. Midgard leverages the operating system concept of virtual memory areas (VMAs) to realize a single Midgard address space where VMAs of all processes can be uniquely mapped. The Midgard address space serves as the namespace for all data in a coherence domain and the cache hierarchy. Because real-world workloads use far fewer VMAs than pages to represent their virtual address space, virtual to Midgard translation is achieved with hardware structures that are much smaller than TLB hierarchies. Costlier Midgard to physical address translations are needed only on LLC misses, which become much less frequent with larger caches. As a consequence, Midgard shows that instead of amplifying address translation overheads, memory hierarchies with large caches can reduce address translation overheads. Our evaluation shows that Midgard achieves only 5% higher address translation overhead as compared to traditional TLB hierarchies for 4KB pages when using a 16MB aggregate LLC. Midgard also breaks even with traditional TLB hierarchies for 2MB pages when using a 256MB aggregate LLC. 
For cache hierarchies with higher capacity, Midgard's address translation overhead drops to near zero as secondary and tertiary data working sets fit in the LLC, while traditional TLBs suffer even higher degrees of address translation overhead.}, - url = {http://infoscience.epfl.ch/record/285466}, - doi = {10.1109/ISCA52012.2021.00047}, -} + title={Rebooting Virtual Memory with Midgard}, + author={Gupta, Siddharth and Bhattacharyya, Atri and Oh, Yunho and Bhattacharjee, Abhishek and Falsafi, Babak and Payer, Mathias}, + abstract={Computer systems designers are building cache hierarchies with higher capacity to capture the ever-increasing working sets of modern workloads. Cache hierarchies with higher capacity improve system performance but shift the performance bottleneck to address translation. We propose Midgard, an intermediate address space between the virtual and the physical address spaces, to mitigate address translation overheads without program-level changes. Midgard leverages the operating system concept of virtual memory areas (VMAs) to realize a single Midgard address space where VMAs of all processes can be uniquely mapped. The Midgard address space serves as the namespace for all data in a coherence domain and the cache hierarchy. Because real-world workloads use far fewer VMAs than pages to represent their virtual address space, virtual to Midgard translation is achieved with hardware structures that are much smaller than TLB hierarchies. Costlier Midgard to physical address translations are needed only on LLC misses, which become much less frequent with larger caches. As a consequence, Midgard shows that instead of amplifying address translation overheads, memory hierarchies with large caches can reduce address translation overheads. Our evaluation shows that Midgard achieves only 5% higher address translation overhead as compared to traditional TLB hierarchies for 4KB pages when using a 16MB aggregate LLC. Midgard also breaks even with traditional TLB hierarchies for 2MB pages when using a 256MB aggregate LLC. For cache hierarchies with higher capacity, Midgard's address translation overhead drops to near zero as secondary and tertiary data working sets fit in the LLC, while traditional TLBs suffer even higher degrees of address translation overhead.}, + url={http://infoscience.epfl.ch/record/285466}, + venue_short={ISCA '21}} + + @article{PourhabibiZarandi:288432, - title = {Cerebros: Evading the RPC Tax in Datacenters}, - author = {Pourhabibi Zarandi, Arash and Sutherland, Mark Johnathon and Daglis, Alexandros and Falsafi, Babak}, - journal = {Proceedings of the 54th International Symposium on Microarchitecture (MICRO'21)}, - pages = {407-420}, - venue_short = {MICRO '21}, - year = {2021}, - abstract = {The emerging paradigm of microservices decomposes online services into fine-grained software modules frequently communicating over the datacenter network, often using Remote Procedure Calls (RPCs). Ongoing advancements in the network stack have exposed the RPC layer itself as a bottleneck, that we show accounts for 40–90% of a microservice's total execution cycles. We break down the underlying modules that comprise production RPC layers and demonstrate, based on prior evidence, that CPUs can only expect limited improvements for such tasks, mandating a shift to hardware to remove the RPC layer as a limiter of microservice performance. 
Although recently proposed accelerators can efficiently handle a portion of the RPC layer, their overall benefit is limited by unnecessary CPU involvement, which occurs because the accelerators are architected as co-processors under the CPU's control. Instead, we show that conclusively removing the RPC layer bottleneck requires all of the RPC layer's modules to be executed by a NIC-attached hardware accelerator. We introduce Cerebros, a dedicated RPC processor that executes the Apache Thrift RPC layer and acts as an intermediary stage between the NIC and the microservice running on the CPU. Our evaluation using the DeathStarBench microservice suite shows that Cerebros reduces the CPU cycles spent in the RPC layer by 37–64×, yielding a 1.8–14× reduction in total cycles expended per microservice request.}, - url = {http://infoscience.epfl.ch/record/288432}, - doi = {10.1145/3466752.3480055}, -} + title={Cerebros: Evading the RPC Tax in Datacenters}, + author={Pourhabibi Zarandi, Arash and Sutherland, Mark Johnathon and Daglis, Alexandros and Falsafi, Babak}, + abstract={The emerging paradigm of microservices decomposes online services into fine-grained software modules frequently communicating over the datacenter network, often using Remote Procedure Calls (RPCs). Ongoing advancements in the network stack have exposed the RPC layer itself as a bottleneck, that we show accounts for 40–90% of a microservice's total execution cycles. We break down the underlying modules that comprise production RPC layers and demonstrate, based on prior evidence, that CPUs can only expect limited improvements for such tasks, mandating a shift to hardware to remove the RPC layer as a limiter of microservice performance. Although recently proposed accelerators can efficiently handle a portion of the RPC layer, their overall benefit is limited by unnecessary CPU involvement, which occurs because the accelerators are architected as co-processors under the CPU's control. Instead, we show that conclusively removing the RPC layer bottleneck requires all of the RPC layer's modules to be executed by a NIC-attached hardware accelerator. We introduce Cerebros, a dedicated RPC processor that executes the Apache Thrift RPC layer and acts as an intermediary stage between the NIC and the microservice running on the CPU. Our evaluation using the DeathStarBench microservice suite shows that Cerebros reduces the CPU cycles spent in the RPC layer by 37–64×, yielding a 1.8–14× reduction in total cycles expended per microservice request.}, + url={http://infoscience.epfl.ch/record/288432}, + venue_short={MICRO '21}} + + @article{DrumondLagesDeOliveira:288715, - title = {Equinox: Training (for Free) on a Custom Inference Accelerator}, - author = {Drumond Lages De Oliveira, Mario Paulo and Coulon, Louis and Pourhabibi Zarandi, Arash and Yüzügüler, Ahmet Caner and Falsafi, Babak and Jaggi, Martin}, - publisher = {ACM}, - journal = {Proceedings of the 54th International Symposium on Microarchitecture (MICRO'21)}, - venue_short = {MICRO '21}, - year = {2021}, - abstract = {DNN inference accelerators executing online services exhibit low average loads because of service demand variability, leading to poor resource utilization. Unfortunately, reclaiming idle inference cycles is difficult as other workloads can not execute on a custom accelerator. With recent proposals for the use of fixed-point arithmetic in training, there are opportunities for training services to piggyback on inference accelerators. 
We make the observation that a key challenge in doing so is maintaining service-level latency constraints for inference. We show that relaxing latency constraints in an inference accelerator with ALU arrays that are batching-optimized achieves near-optimal throughput for a given area and power envelope while maintaining inference services' tail latency goals. We present Equinox, a custom inference accelerator designed to piggyback training. Equinox employs a uniform arithmetic encoding to accommodate inference and training and a priority hardware scheduler with adaptive batching that interleaves training during idle inference cycles. For a500𝜇𝑠 inference service time constraint, Equinox achieves 6.67× higher throughput than a latency-optimal inference accelerator. Despite not being optimized for training services, Equinox achieves up to 78% of the throughput of a dedicated training accelerator that saturates the available compute resources and DRAM bandwidth. Finally, Equinox’s controller logic incurs less than 1% power and area overhead, while the uniform encoding (to enable training) incurs 13% power and 4% area overhead compared to a fixed-point inference accelerator.}, - url = {http://infoscience.epfl.ch/record/288715}, - doi = {10.1145/3466752.3480057}, -} + title={Equinox: Training (for Free) on a Custom Inference Accelerator}, + author={Drumond Lages De Oliveira, Mario Paulo and Coulon, Louis and Pourhabibi Zarandi, Arash and Yüzügüler, Ahmet Caner and Falsafi, Babak and Jaggi, Martin}, + abstract={DNN inference accelerators executing online services exhibit low average loads because of service demand variability, leading to poor resource utilization. Unfortunately, reclaiming idle inference cycles is difficult as other workloads can not execute on a custom accelerator. With recent proposals for the use of fixed-point arithmetic in training, there are opportunities for training services to piggyback on inference accelerators. We make the observation that a key challenge in doing so is maintaining service-level latency constraints for inference. We show that relaxing latency constraints in an inference accelerator with ALU arrays that are batching-optimized achieves near-optimal throughput for a given area and power envelope while maintaining inference services' tail latency goals. We present Equinox, a custom inference accelerator designed to piggyback training. Equinox employs a uniform arithmetic encoding to accommodate inference and training and a priority hardware scheduler with adaptive batching that interleaves training during idle inference cycles. For a 500𝜇𝑠 inference service time constraint, Equinox achieves 6.67× higher throughput than a latency-optimal inference accelerator. Despite not being optimized for training services, Equinox achieves up to 78% of the throughput of a dedicated training accelerator that saturates the available compute resources and DRAM bandwidth. 
Finally, Equinox’s controller logic incurs less than 1% power and area overhead, while the uniform encoding (to enable training) incurs 13% power and 4% area overhead compared to a fixed-point inference accelerator.}, + url={http://infoscience.epfl.ch/record/288715}, + venue_short={MICRO '21}} + + +@article{Primorac:283315, + title={When to Hedge in Interactive Services}, + author={Primorac, Mia and Argyraki, Katerina and Bugnion, Edouard}, + abstract={In online data-intensive (OLDI) services, each client request typically executes on multiple servers in parallel; as a result, “system hiccups”, although rare within a single server, can interfere with many client requests and cause violations of service-level objectives. Service providers have long been fighting this “tail at scale” problem through “hedging”, i.e., issuing redundant queries to mask system hiccups. This, however, can potentially cause congestion that is more detrimental to tail latency than the hiccups themselves. This paper asks: when does it make sense to hedge in OLDI services, and how can we hedge enough to mask system hiccups but not as much as to cause congestion? First, we show that there are many realistic scenarios where hedging can have no benefit—where any hedging-based scheduling policy, including the state-of-the-art, yields no latency reduction compared to optimal load balancing without hedging. Second, we propose LÆDGE, a scheduling policy that combines optimal load balancing with work-conserving hedging, and evaluate it in an AWS cloud deployment. We show that LÆDGE strikes the right balance: first, unlike the state of the art, it never causes unnecessary congestion; second, it performs close to an ideal scheduling policy, improving the 99th percentile latency by as much as 49%, measured on 60% system utilization—without any difficult parameter training as found in the state of the art.}, + url={http://infoscience.epfl.ch/record/283315}, + venue_short={NSDI '21}} + + @article{Bhardwaj:289218, - title = {NrOS: Effective Replication and Sharing in an Operating System}, - author = {Bhardwaj, Ankit and Kulkarni, Chinmay and Achermann, Reto and Calciu, Irina and Kashyap, Sanidhya and Stutsman, Ryan and Tai, Amy and Zellweger, Gerd}, - publisher = {USENIX ASSOC}, - journal = {Proceedings Of The 15Th Usenix Symposium On Operating Systems Design And Implementation (OSDI'21)}, - address = {Berkeley}, - pages = {295-312}, - venue_short = {OSDI '21}, - year = {2021}, - abstract = {Writing a correct operating system kernel is notoriously hard. Kernel code requires manual memory management and type-unsafe code and must efficiently handle complex, asynchronous events. In addition, increasing CPU core counts further complicate kernel development. Typically, monolithic kernels share state across cores and rely on one-off synchronization patterns that are specialized for each kernel structure or subsystem. Hence, kernel developers are constantly refining synchronization within OS kernels to improve scalability at the risk of introducing subtle bugs.}, - url = {http://infoscience.epfl.ch/record/289218}, -} + title={NrOS: Effective Replication and Sharing in an Operating System}, + author={Bhardwaj, Ankit and Kulkarni, Chinmay and Achermann, Reto and Calciu, Irina and Kashyap, Sanidhya and Stutsman, Ryan and Tai, Amy and Zellweger, Gerd}, + abstract={Writing a correct operating system kernel is notoriously hard. 
Kernel code requires manual memory management and type-unsafe code and must efficiently handle complex, asynchronous events. In addition, increasing CPU core counts further complicate kernel development. Typically, monolithic kernels share state across cores and rely on one-off synchronization patterns that are specialized for each kernel structure or subsystem. Hence, kernel developers are constantly refining synchronization within OS kernels to improve scalability at the risk of introducing subtle bugs.}, + url={http://infoscience.epfl.ch/record/289218}, + venue_short={OSDI '21}} + + @article{Sioulas:285140, - title = {Scalable Multi-Query Execution using Reinforcement Learning}, - author = {Sioulas, Panagiotis and Ailamaki, Anastasia}, - publisher = {Association for Computing Machinery}, - journal = {Proceedings of the 2021 International Conference on Management of Data (SIGMOD'21)}, - address = {New York}, - pages = {13}, - venue_short = {SIGMOD '21}, - year = {2021}, - abstract = {The growing demand for data-intensive decision support and the migration to multi-tenant infrastructures put databases under the stress of high analytical query load. The requirement for high throughput contradicts the traditional design of query-at-a-time databases that optimize queries for efficient serial execution. Sharing work across queries presents an opportunity to reduce the total cost of processing and therefore improve throughput with increasing query load. Systems can share work either by assessing all opportunities and restructuring batches of queries ahead of execution, or by inspecting opportunities in individual incoming queries at runtime: the former strategy scales poorly to large query counts, as it requires expensive sharing-aware optimization, whereas the latter detects only a subset of the opportunities. Both strategies fail to minimize the cost of processing for large and ad-hoc workloads. This paper presents RouLette, a specialized intelligent engine for multi-query execution that addresses, through runtime adaptation, the shortcomings of existing work-sharing strategies. RouLette scales by replacing sharing-aware optimization with adaptive query processing, and it chooses opportunities to explore and exploit by using reinforcement learning. RouLette also includes optimizations that reduce the adaptation overhead. RouLette increases throughput by 1.6-28.3x, compared to a state-of-the-art query-at-a-time engine, and up to 6.5x, compared to sharing-enabled prototypes, for multi-query workloads based on the schema of TPC-DS.}, - url = {http://infoscience.epfl.ch/record/285140}, - doi = {10.1145/3448016.3452799}, -} -@article{KokorisKogias:287444, - title = {CALYPSO: Private Data Management for Decentralized Ledgers}, - author = {Kokoris Kogias, Eleftherios and Alp, Enis Ceyhun and Gasser, Linus and Jovanovic, Philipp Svetolik and Syta, Ewa and Ford, Bryan Alexander}, - journal = {Proceedings of the VLDB Endowment}, - number = {4}, - volume = {14}, - pages = {586-599}, - venue_short = {VLDB '21}, - year = {2021}, - abstract = {Distributed ledgers provide high availability and integrity, making them a key enabler for practical and secure computation of distributed workloads among mutually distrustful parties. Many practical applications also require strong confidentiality, however. This work enhances permissioned and permissionless blockchains with the ability to manage confidential data without forfeiting availability or decentralization. 
The proposed Calypso architecture addresses two orthogonal challenges confronting modern distributed ledgers: (a) enabling the auditable management of secrets and (b) protecting distributed computations against arbitrage attacks when their results depend on the ordering and secrecy of inputs. Calypso introduces on-chain secrets, a novel abstraction that enforces atomic deposition of an auditable trace whenever users access confidential data. Calypso provides user-controlled consent management that ensures revocation atomicity and accountable anonymity. To enable permissionless deployment, we introduce an incentive scheme and provide users with the option to select their preferred trustees. We evaluated our Calypso prototype with a confidential document-sharing application and a decentralized lottery. Our benchmarks show that transaction-processing latency increases linearly in terms of security (number of trustees) and is in the range of 0.2 to 8 seconds for 16 to 128 trustees.}, - url = {http://infoscience.epfl.ch/record/287444}, - doi = {10.14778/3436905.3436917}, -} + title={Scalable Multi-Query Execution using Reinforcement Learning}, + author={Sioulas, Panagiotis and Ailamaki, Anastasia}, + abstract={The growing demand for data-intensive decision support and the migration to multi-tenant infrastructures put databases under the stress of high analytical query load. The requirement for high throughput contradicts the traditional design of query-at-a-time databases that optimize queries for efficient serial execution. Sharing work across queries presents an opportunity to reduce the total cost of processing and therefore improve throughput with increasing query load. Systems can share work either by assessing all opportunities and restructuring batches of queries ahead of execution, or by inspecting opportunities in individual incoming queries at runtime: the former strategy scales poorly to large query counts, as it requires expensive sharing-aware optimization, whereas the latter detects only a subset of the opportunities. Both strategies fail to minimize the cost of processing for large and ad-hoc workloads. This paper presents RouLette, a specialized intelligent engine for multi-query execution that addresses, through runtime adaptation, the shortcomings of existing work-sharing strategies. RouLette scales by replacing sharing-aware optimization with adaptive query processing, and it chooses opportunities to explore and exploit by using reinforcement learning. RouLette also includes optimizations that reduce the adaptation overhead. RouLette increases throughput by 1.6-28.3x, compared to a state-of-the-art query-at-a-time engine, and up to 6.5x, compared to sharing-enabled prototypes, for multi-query workloads based on the schema of TPC-DS.}, + url={http://infoscience.epfl.ch/record/285140}, + venue_short={SIGMOD '21}} + + @article{Sirin:284817, - title = {Micro-architectural Analysis of In-memory OLTP: Revisited}, - author = {Sirin, Utku and Tözün, Pınar and Porobic, Danica and Yasin, Ahmad and Ailamaki, Anastasia}, - journal = {The VLDB Journal}, - number = {4}, - volume = {30}, - venue_short = {VLDBJ '21}, - year = {2021}, - abstract = {Micro-architectural behavior of traditional disk-based online transaction processing (OLTP) systems has been investigated extensively over the past couple of decades. Results show that traditional OLTP systems mostly under-utilize the available micro-architectural resources. 
In-memory OLTP systems, on the other hand, process all the data in main-memory and, therefore, can omit the buffer pool. Furthermore, they usually adopt more lightweight concurrency control mechanisms, cache-conscious data structures, and cleaner codebases since they are usually designed from scratch. Hence, we expect significant differences in micro-architectural behavior when running OLTP on platforms optimized for in-memory processing as opposed to disk-based database systems. In particular, we expect that in-memory systems exploit micro-architectural features such as instruction and data caches significantly better than disk-based systems. This paper sheds light on the micro-architectural behavior of in-memory database systems by analyzing and contrasting it to the behavior of disk-based systems when running OLTP workloads. The results show that, despite all the design changes, in-memory OLTP exhibits very similar micro-architectural behavior to disk-based OLTP: more than half of the execution time goes to memory stalls where instruction cache misses or the long-latency data misses from the last-level cache (LLC) are the dominant factors in the overall execution time. Even though ground-up designed in-memory systems can eliminate the instruction cache misses, the reduction in instruction stalls amplifies the impact of LLC data misses. As a result, only 30% of the CPU cycles are used to retire instructions, and 70% of the CPU cycles are wasted to stalls for both traditional disk-based and new generation in-memory OLTP.}, - url = {http://infoscience.epfl.ch/record/284817}, - doi = {10.1007/s00778-021-00663-8}, -} -@article{PourhabibiZarandi:274129, - title = {Optimus Prime: Accelerating Data Transformation in Servers}, - author = {Pourhabibi Zarandi, Arash and Gupta, Siddharth and Kassir, Hussein and Sutherland, Mark Johnathon and Tian, Zilu and Drumond Lages De Oliveira, Mario Paulo and Falsafi, Babak and Koch, Christoph}, - publisher = {ACM}, - journal = {Proceedings of the Twenty-Fifth International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS'20)}, - address = {New York, NY, USA}, - pages = {14. 1203-1216}, - venue_short = {ASPLOS '20}, - year = {2020}, - abstract = {Modern online services are shifting away from monolithic applications to loosely-coupled microservices because of their improved scalability, reliability, programmability and development velocity. Microservices communicating over the datacenter network require data transformation (DT) to convert messages back and forth between their internal formats. This work identifies DT as a bottleneck due to reductions in latency of the surrounding system components, namely application runtimes, protocol stacks, and network hardware. We therefore propose Optimus Prime (OP), a programmable DT accelerator that uses a novel abstraction, an in-memory schema, to represent DT operations. The schema is compatible with today's DT frameworks and enables any compliant accelerator to perform the transformations comprising a request in parallel. Our evaluation shows that OP's DT throughput matches the line rate of today's NICs and has 60x higher throughput compared to software, at a tiny fraction of the CPU's silicon area and power. 
We also evaluate a set of microservices running on Thrift, and show up to 30% reduction in service latency.}, - url = {http://infoscience.epfl.ch/record/274129}, - doi = {10.1145/3373376.3378501}, -} -@article{Aguilera:287833, - title = {Microsecond Consensus for Microsecond Applications}, - author = {Aguilera, Marcos K. and Ben-David, Naama and Guerraoui, Rachid and Marathe, Virendra J. and Xygkis, Athanasios and Zablotchi, Igor}, - publisher = {USENIX ASSOC}, - journal = {Proceedings Of The 14th Usenix Symposium On Operating Systems Design And Implementation (OSDI'20)}, - address = {Berkeley}, - pages = {599-616}, - venue_short = {OSDI '20}, - year = {2020}, - abstract = {We consider the problem of making apps fault-tolerant through replication, when apps operate at the microsecond scale, as in finance, embedded computing, and microservices apps. These apps need a replication scheme that also operates at the microsecond scale, otherwise replication becomes a burden. We propose Mu, a system that takes less than 1.3 microseconds to replicate a (small) request in memory, and less than a millisecond to fail-over the system-this cuts the replication and fail-over latencies of the prior systems by at least 61% and 90%. Mu implements bona fide state machine replication/consensus (SMR) with strong consistency for a generic app, but it really shines on microsecond apps, where even the smallest overhead is significant. To provide this performance, Mu introduces a new SMR protocol that carefully leverages RDMA. Roughly, in Mu a leader replicates a request by simply writing it directly to the log of other replicas using RDMA, without any additional communication. Doing so, however, introduces the challenge of handling concurrent leaders, changing leaders, garbage collecting the logs, and more-challenges that we address in this paper through a judicious combination of RDMA permissions and distributed algorithmic design. We implemented Mu and used it to replicate several systems: a financial exchange app called Liquibook, Redis, Memcached, and HERD [33]. Our evaluation shows that Mu incurs a small replication latency, in some cases being the only viable replication system that incurs an acceptable overhead.}, - url = {http://infoscience.epfl.ch/record/287833}, -} + title={Micro-architectural Analysis of In-memory OLTP: Revisited}, + author={Sirin, Utku and Tözün, Pınar and Porobic, Danica and Yasin, Ahmad and Ailamaki, Anastasia}, + abstract={Micro-architectural behavior of traditional disk-based online transaction processing (OLTP) systems has been investigated extensively over the past couple of decades. Results show that traditional OLTP systems mostly under-utilize the available micro-architectural resources. In-memory OLTP systems, on the other hand, process all the data in main-memory and, therefore, can omit the buffer pool. Furthermore, they usually adopt more lightweight concurrency control mechanisms, cache-conscious data structures, and cleaner codebases since they are usually designed from scratch. Hence, we expect significant differences in micro-architectural behavior when running OLTP on platforms optimized for in-memory processing as opposed to disk-based database systems. In particular, we expect that in-memory systems exploit micro-architectural features such as instruction and data caches significantly better than disk-based systems. 
This paper sheds light on the micro-architectural behavior of in-memory database systems by analyzing and contrasting it to the behavior of disk-based systems when running OLTP workloads. The results show that, despite all the design changes, in-memory OLTP exhibits very similar micro-architectural behavior to disk-based OLTP: more than half of the execution time goes to memory stalls where instruction cache misses or the long-latency data misses from the last-level cache (LLC) are the dominant factors in the overall execution time. Even though ground-up designed in-memory systems can eliminate the instruction cache misses, the reduction in instruction stalls amplifies the impact of LLC data misses. As a result, only 30% of the CPU cycles are used to retire instructions, and 70% of the CPU cycles are wasted to stalls for both traditional disk-based and new generation in-memory OLTP.}, + url={http://infoscience.epfl.ch/record/284817}, + venue_short={VLDBJ '21}} + + @article{Dauterman:287869, + title={SafetyPin: Encrypted Backups with Human-Memorable Secrets}, + author={Dauterman, Emma and Corrigan-Gibbs, Henry and Mazieres, David}, + abstract={We present the design and implementation of SafetyPin, a system for encrypted mobile-device backups. Like existing cloud-based mobile-backup systems, including those of Apple and Google, SafetyPin requires users to remember only a short PIN and defends against brute-force PIN-guessing attacks using hardware security protections. Unlike today's systems, SafetyPin splits trust over a cluster of hardware security modules (HSMs) in order to provide security guarantees that scale with the number of HSMs. In this way, SafetyPin protects backed-up user data even against an attacker that can adaptively compromise many of the system's constituent HSMs. SafetyPin provides this protection without sacrificing scalability or fault tolerance. Decentralizing trust while respecting the resource limits of today's HSMs requires a synthesis of systems-design principles and cryptographic tools. We evaluate SafetyPin on a cluster of 100 low-cost HSMs and show that a SafetyPin-protected recovery takes 1.01 seconds. To process 1B recoveries a year, we estimate that a SafetyPin deployment would need 3,100 low-cost HSMs.}, + url={http://infoscience.epfl.ch/record/287869}, + venue_short={OSDI '20}} + + @article{Cohn-Gordon:287859, + title={DELF: Safeguarding Deletion Correctness in Online Social Networks}, + author={Cohn-Gordon, Katriel and Damaskinos, Georgios and Neto, Divino and Cordova, Shi and Reitz, Benoit and Strahs, Benjamin and Obenshain, Daniel and Pearce, Paul and Papagiannis, Ioannis}, + abstract={Deletion is a core facet of Online Social Networks (OSNs). For users, deletion is a tool to remove what they have shared and control their data. For OSNs, robust deletion is both an obligation to their users and a risk when developer mistakes inevitably occur. While developers are effective at identifying high-level deletion requirements in products (e.g., users should be able to delete posted photos), they are less effective at mapping high-level requirements into concrete operations (e.g., deleting all relevant items in data stores). Without framework support, developer mistakes lead to violations of users' privacy, such as retaining data that should be deleted, deleting the wrong data, and exploitable vulnerabilities.}, + url={http://infoscience.epfl.ch/record/287859}, + venue_short={USENIX Security '20}} -@article{Pirelli:282923, - title = {A Simpler and Faster NIC Driver Model for Network Functions}, - author = {Pirelli, Solal and Candea, George}, - publisher = {USENIX ASSOC}, - journal = {Proceedings Of The 14th Usenix Symposium On Operating Systems Design And Implementation (OSDI'20)}, - address = {Berkeley}, - venue_short = {OSDI '20}, - year = {2020}, - abstract = {The advent of software network functions calls for stronger correctness guarantees and higher performance at every level of the stack. Current network stacks trade simplicity for performance and flexibility, especially in their driver model. We show that performance and simplicity can coexist, at the cost of some flexibility, with a new NIC driver model tailored to network functions. The key idea behind our model is that the driver can efficiently reuse packet buffers because buffers follow a single logical path. We implement a driver for the Intel 82599 network card in 550 lines of code. 
By merely replacing the state-of-theart driver with our driver, formal verification of the entire software stack completes in 7x less time, while the verified functions’ throughput improves by 160%. Our driver also beats, on realistic workloads, the throughput of drivers that cannot yet be formally verified, thanks to its low variability and resource use. Our code is available at github.com/dslab-epfl/tinynf.}, - url = {http://infoscience.epfl.ch/record/282923}, -} + title={SafetyPin: Encrypted Backups with Human-Memorable Secrets}, + author={Dauterman, Emma and Corrigan-Gibbs, Henry and Mazieres, David}, + abstract={We present the design and implementation of SafetyPin, a system for encrypted mobile-device backups. Like existing cloud-based mobile-backup systems, including those of Apple and Google, SafetyPin requires users to remember only a short PIN and defends against brute-force PIN-guessing attacks using hardware security protections. Unlike today's systems, SafetyPin splits trust over a cluster of hardware security modules (HSMs) in order to provide security guarantees that scale with the number of HSMs. In this way, SafetyPin protects backed-up user data even against an attacker that can adaptively compromise many of the system's constituent HSMs. SafetyPin provides this protection without sacrificing scalability or fault tolerance. Decentralizing trust while respecting the resource limits of today's HSMs requires a synthesis of systems-design principles and cryptographic tools. We evaluate SafetyPin on a cluster of 100 low-cost HSMs and show that a SafetyPin-protected recovery takes 1.01 seconds. To process 1B recoveries a year, we estimate that a SafetyPin deployment would need 3,100 low-cost HSMs.}, + url={http://infoscience.epfl.ch/record/287869}, + venue_short={OSDI '20}} + + @article{Cohn-Gordon:287859, - title = {DELF: Safeguarding Deletion Correctness in Online Social Networks}, - author = {Cohn-Gordon, Katriel and Damaskinos, Georgios and Neto, Divino and Cordova, Shi and Reitz, Benoit and Strahs, Benjamin and Obenshain, Daniel and Pearce, Paul and Papagiannis, Loannis}, - publisher = {USENIX ASSOC}, - journal = {Proceedings Of The 29th Usenix Security Symposium}, - address = {Berkeley}, - pages = {1057-1074}, - venue_short = {USENIX Security '20}, - year = {2020}, - abstract = {Deletion is a core facet of Online Social Networks (OSNs). For users, deletion is a tool to remove what they have shared and control their data. For OSNs, robust deletion is both an obligation to their users and a risk when developer mistakes inevitably occur. While developers are effective at identifying high-level deletion requirements in products (e.g., users should be able to delete posted photos), they are less effective at mapping high-level requirements into concrete operations (e.g., deleting all relevant items in data stores). Without framework support, developer mistakes lead to violations of users' privacy, such as retaining data that should be deleted, deleting the wrong data, and exploitable vulnerabilities.}, - url = {http://infoscience.epfl.ch/record/287859}, -} -@article{Ispoglou:287822, - title = {FuzzGen: Automatic Fuzzer Generation}, - author = {Ispoglou, Kyriakos K.
and Austin, Daniel and Mohan, Vishwath and Payer, Mathias}, - publisher = {USENIX ASSOC}, - journal = {Proceedings Of The 29th Usenix Security Symposium}, - address = {Berkeley}, - pages = {2271-2287}, - venue_short = {USENIX Security '20}, - year = {2020}, - abstract = {Fuzzing is a testing technique to discover unknown vulnerabilities in software. When applying fuzzing to libraries, the core idea of supplying random input remains unchanged, yet it is non-trivial to achieve good code coverage. Libraries cannot run as standalone programs, but instead are invoked through another application. Triggering code deep in a library remains challenging as specific sequences of API calls are required to build up the necessary state. Libraries are diverse and have unique interfaces that require unique fuzzers, so far written by a human analyst.}, - url = {http://infoscience.epfl.ch/record/287822}, -} -@article{Clements:287890, - title = {HALucinator: Firmware Re-hosting Through Abstraction Layer Emulation}, - author = {Clements, Abraham A. and Gustafson, Eric and Scharnowski, Tobias and Grosen, Paul and Fritz, David and Kruegel, Christopher and Vigna, Giovanni and Bagchi, Saurabh and Payer, Mathias}, - publisher = {USENIX ASSOC}, - journal = {Proceedings Of The 29th Usenix Security Symposium}, - address = {Berkeley}, - pages = {1201-1218}, - venue_short = {USENIX Security '20}, - year = {2020}, - abstract = {Given the increasing ubiquity of online embedded devices, analyzing their firmware is important to security, privacy, and safety. The tight coupling between hardware and firmware and the diversity found in embedded systems makes it hard to perform dynamic analysis on firmware. However, firmware developers regularly develop code using abstractions, such as Hardware Abstraction Layers (HALs), to simplify their job. We leverage such abstractions as the basis for the re-hosting and analysis of firmware. By providing high-level replacements for HAL functions (a process termed High-Level Emulation - HLE), we decouple the hardware from the firmware. This approach works by first locating the library functions in a firmware sample, through binary analysis, and then providing generic implementations of these functions in a full-system emulator.}, - url = {http://infoscience.epfl.ch/record/287890}, -} -@article{Peng:287847, - title = {USBFuzz: A Framework for Fuzzing USB Drivers by Device Emulation}, - author = {Peng, Hui and Payer, Mathias}, - publisher = {USENIX ASSOC}, - journal = {Proceedings Of The 29th Usenix Security Symposium}, - address = {Berkeley}, - pages = {2559-2575}, - venue_short = {USENIX Security '20}, - year = {2020}, - abstract = {The Universal Serial Bus (USB) connects external devices to a host. This interface exposes the OS kernels and device drivers to attacks by malicious devices. Unfortunately, kernels and drivers were developed under a security model that implicitly trusts connected devices. Drivers expect faulty hardware but not malicious attacks. Similarly, security testing drivers is challenging as input must cross the hardware/software barrier. Fuzzing, the most widely used bug finding technique, relies on providing random data to programs. 
However, fuzzing device drivers is challenging due to the difficulty in crossing the hardware/software barrier and providing random device data to the driver under test.}, - url = {http://infoscience.epfl.ch/record/287847}, -} -@article{Anadiotis:282822, - title = {A System Design for Elastically Scaling Transaction Processing Engines in Virtualized Servers}, - author = {Anadiotis, Angelos-Christos and Appuswamy, Raja and Ailamaki, Anastasia and Bronshtein, Ilan and Avni, Hillel and Dominguez-Sal, David and Goikhman, Shay and Levy, Eliezer}, - publisher = {ASSOC COMPUTING MACHINERY}, - journal = {Proceedings of the VLDB Endowment}, - address = {New York}, - number = {12}, - volume = {13}, - pages = {3085-3098}, - venue_short = {VLDB '20}, - year = {2020}, - abstract = {Online Transaction Processing (OLTP) deployments are migrating from on-premise to cloud settings in order to exploit the elasticity of cloud infrastructure which allows them to adapt to workload variations. However, cloud adaptation comes at the cost of redesigning the engine, which has led to the introduction of several, new, cloud-based transaction processing systems mainly focusing on: (i) the transaction coordination protocol, (ii) the data partitioning strategy, and, (iii) the resource isolation across multiple tenants. As a result, standalone OLTP engines cannot be easily deployed with an elastic setting in the cloud and they need to migrate to another, specialized deployment.}, - url = {http://infoscience.epfl.ch/record/282822}, - doi = {10.14778/3415478.3415536}, -} -@article{Sirin:280879, - title = {Micro-architectural Analysis of OLAP: Limitations and Opportunities}, - author = {Sirin, Utku and Ailamaki, Anastasia}, - publisher = {ASSOC COMPUTING MACHINERY}, - journal = {Proceedings of the VLDB Endowment},, - address = {New York}, - number = {6}, - volume = {13}, - pages = {840-853}, - venue_short = {VLDB '20}, - year = {2020}, - abstract = {Understanding micro-architectural behavior is important for efficiently using hardware resources. Recent work has shown that in-memory online transaction processing (OLTP) systems severely underutilize their core micro-architecture resources [29]. Whereas, online analytical processing (OLAP) workloads exhibit a completely different computing pattern. OLAP workloads are read-only, bandwidth-intensive, and include various data access patterns. With the rise of column-stores, they run on high-performance engines that are tightly optimized for modern hardware. Consequently, micro-architectural behavior of modern OLAP systems remains unclear.}, - url = {http://infoscience.epfl.ch/record/280879}, - doi = {10.14778/3380750.3380755}, -} + title={DELF: Safeguarding Deletion Correctness in Online Social Networks}, + author={Cohn-Gordon, Katriel and Damaskinos, Georgios and Neto, Divino and Cordova, Shi and Reitz, Benoit and Strahs, Benjamin and Obenshain, Daniel and Pearce, Paul and Papagiannis, Ioannis}, + abstract={Deletion is a core facet of Online Social Networks (OSNs). For users, deletion is a tool to remove what they have shared and control their data. For OSNs, robust deletion is both an obligation to their users and a risk when developer mistakes inevitably occur. While developers are effective at identifying high-level deletion requirements in products (e.g., users should be able to delete posted photos), they are less effective at mapping high-level requirements into concrete operations (e.g., deleting all relevant items in data stores).
Without framework support, developer mistakes lead to violations of users' privacy, such as retaining data that should be deleted, deleting the wrong data, and exploitable vulnerabilities.}, + url={http://infoscience.epfl.ch/record/287859}, + venue_short={USENIX Security '20}} + + @article{Olma:275745, - title = {Adaptive Partitioning and Indexing for In-situ Query Processing}, - author = {Olma, Matthaios and Karpathiotakis, Manos and Alagiannis, Ioannis and Athanassoulis, Manos and Ailamaki, Anastasia}, - publisher = {SPRINGER}, - journal = {The VLDB Journal}, - address = {New York}, - number = {1}, - volume = {29}, - pages = {569-591}, - venue_short = {VLDBJ '20}, - year = {2020}, - abstract = {The constant flux of data and queries alike has been pushing the boundaries of data analysis systems. The increasing size of raw data files has made data loading an expensive operation that delays the data-to-insight time. To alleviate the loading cost, in situ query processing systems operate directly over raw data and offer instant access to data. At the same time, analytical workloads have increasing number of queries. Typically, each query focuses on a constantly shifting-yet small-range. As a result, minimizing the workload latency requires the benefits of indexing in in situ query processing. In this paper, we present an online partitioning and indexing scheme, along with a partitioning and indexing tuner tailored for in situ querying engines. The proposed system design improves query execution time by taking into account user query patterns, to (i) partition raw data files logically and (ii) build lightweight partition-specific indexes for each partition. We build an in situ query engine called Slalom to showcase the impact of our design. Slalom employs adaptive partitioning and builds non-obtrusive indexes in different partitions on-the-fly based on lightweight query access pattern monitoring. As a result of its lightweight nature, Slalom achieves efficient query processing over raw data with minimal memory consumption. Our experimentation with both microbenchmarks and real-life workloads shows that Slalom outperforms state-of-the-art in situ engines and achieves comparable query response times with fully indexed DBMS, offering lower cumulative query execution times for query workloads with increasing size and unpredictable access patterns.}, - url = {http://infoscience.epfl.ch/record/275745}, - doi = {10.1007/s00778-019-00580-x}, -} -@article{Daglis:265809, - title = {RPCValet: NI-Driven Tail-Aware Balancing of µs-Scale RPCs}, - author = {Daglis, Alexandros and Sutherland, Mark and Falsafi, Babak}, - publisher = {ACM}, - journal = {Proceedings of the Twenty-Fourth International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS'19)}, - address = {New York, NY}, - pages = {35-48}, - venue_short = {ASPLOS '19}, - year = {2019}, - abstract = {Modern online services come with stringent quality requirements in terms of response time tail latency. Because of their decomposition into fine-grained communicating software layers, a single user request fans out into a plethora of short, μs-scale RPCs, aggravating the need for faster inter-server communication. In reaction to that need, we are witnessing a technological transition characterized by the emergence of hardware-terminated user-level protocols (e.g., InfiniBand/RDMA) and new architectures with fully integrated Network Interfaces (NIs). 
Such architectures offer a unique opportunity for a new NI-driven approach to balancing RPCs among the cores of manycore server CPUs, yielding major tail latency improvements for μs-scale RPCs. We introduce RPCValet, an NI-driven RPC load-balancing design for architectures with hardware-terminated protocols and integrated NIs, that delivers near-optimal tail latency. RPCValet's RPC dispatch decisions emulate the theoretically optimal single-queue system, without incurring synchronization overheads currently associated with single-queue implementations. Our design improves throughput under tight tail latency goals by up to 1.4x, and reduces tail latency before saturation by up to 4x for RPCs with μs-scale service times, as compared to current systems with hardware support for RPC load distribution. RPCValet performs within 15% of the theoretically optimal single-queue system.}, - url = {http://infoscience.epfl.ch/record/265809}, - doi = {10.1145/3297858.3304070}, -} + title={Adaptive Partitioning and Indexing for In-situ Query Processing}, + author={Olma, Matthaios and Karpathiotakis, Manos and Alagiannis, Ioannis and Athanassoulis, Manos and Ailamaki, Anastasia}, + abstract={The constant flux of data and queries alike has been pushing the boundaries of data analysis systems. The increasing size of raw data files has made data loading an expensive operation that delays the data-to-insight time. To alleviate the loading cost, in situ query processing systems operate directly over raw data and offer instant access to data. At the same time, analytical workloads have increasing number of queries. Typically, each query focuses on a constantly shifting-yet small-range. As a result, minimizing the workload latency requires the benefits of indexing in in situ query processing. In this paper, we present an online partitioning and indexing scheme, along with a partitioning and indexing tuner tailored for in situ querying engines. The proposed system design improves query execution time by taking into account user query patterns, to (i) partition raw data files logically and (ii) build lightweight partition-specific indexes for each partition. We build an in situ query engine called Slalom to showcase the impact of our design. Slalom employs adaptive partitioning and builds non-obtrusive indexes in different partitions on-the-fly based on lightweight query access pattern monitoring. As a result of its lightweight nature, Slalom achieves efficient query processing over raw data with minimal memory consumption. Our experimentation with both microbenchmarks and real-life workloads shows that Slalom outperforms state-of-the-art in situ engines and achieves comparable query response times with fully indexed DBMS, offering lower cumulative query execution times for query workloads with increasing size and unpredictable access patterns.}, + url={http://infoscience.epfl.ch/record/275745}, + venue_short={VLDBJ '20}} + + +@article{PourhabibiZarandi:274129, + title={Optimus Prime: Accelerating Data Transformation in Servers}, + author={Pourhabibi Zarandi, Arash and Gupta, Siddharth and Kassir, Hussein and Sutherland, Mark Johnathon and Tian, Zilu and Drumond Lages De Oliveira, Mario Paulo and Falsafi, Babak and Koch, Christoph}, + abstract={Modern online services are shifting away from monolithic applications to loosely-coupled microservices because of their improved scalability, reliability, programmability and development velocity. 
Microservices communicating over the datacenter network require data transformation (DT) to convert messages back and forth between their internal formats. This work identifies DT as a bottleneck due to reductions in latency of the surrounding system components, namely application runtimes, protocol stacks, and network hardware. We therefore propose Optimus Prime (OP), a programmable DT accelerator that uses a novel abstraction, an in-memory schema, to represent DT operations. The schema is compatible with today's DT frameworks and enables any compliant accelerator to perform the transformations comprising a request in parallel. Our evaluation shows that OP's DT throughput matches the line rate of today's NICs and has 60x higher throughput compared to software, at a tiny fraction of the CPU's silicon area and power. We also evaluate a set of microservices running on Thrift, and show up to 30% reduction in service latency.}, + url={http://infoscience.epfl.ch/record/274129}, + venue_short={ASPLOS '20}} + + @article{Sutherland:277391, + title={The NEBULA RPC-Optimized Architecture}, + author={Sutherland, Mark and Gupta, Siddharth and Falsafi, Babak and Marathe, Virendra and Pnevmatikatos, Dionisios and Daglis, Alexandros}, + abstract={Large-scale online services are commonly structured as a network of software tiers, which communicate over the datacenter network using RPCs. Ongoing trends towards software decomposition have led to the prevalence of tiers receiving and generating RPCs with runtimes of only a few microseconds. With such small software runtimes, even the smallest latency overheads in RPC handling have a significant relative performance impact. In particular, we find that growing network bandwidth introduces queuing effects within a server’s memory hierarchy, considerably hurting the response latency of fine-grained RPCs. In this work we introduce NeBuLa, an architecture optimized to accelerate the most challenging microsecond-scale RPCs, by leveraging two novel mechanisms to drastically improve server throughput under strict tail latency goals. First, NeBuLa reduces detrimental queuing at the memory controllers via hardware support for efficient in-LLC network buffer management. Second, NeBuLa’s network interface steers incoming RPCs into the CPU cores’ L1 caches, improving RPC startup latency. Our evaluation shows that NeBuLa boosts the throughput of a state-of-the-art key-value store by 1.25–2.19x compared to existing proposals, while maintaining strict tail latency goals.}, + url={http://infoscience.epfl.ch/record/277391}, + venue_short={ISCA '20}} + + @article{Pirelli:282923, + title={A Simpler and Faster NIC Driver Model for Network Functions}, + author={Pirelli, Solal and Candea, George}, + abstract={The advent of software network functions calls for stronger correctness guarantees and higher performance at every level of the stack. Current network stacks trade simplicity for performance and flexibility, especially in their driver model. We show that performance and simplicity can coexist, at the cost of some flexibility, with a new NIC driver model tailored to network functions. The key idea behind our model is that the driver can efficiently reuse packet buffers because buffers follow a single logical path. We implement a driver for the Intel 82599 network card in 550 lines of code.
By merely replacing the state-of-the-art driver with our driver, formal verification of the entire software stack completes in 7x less time, while the verified functions’ throughput improves by 160%. Our driver also beats, on realistic workloads, the throughput of drivers that cannot yet be formally verified, thanks to its low variability and resource use. Our code is available at github.com/dslab-epfl/tinynf.}, + url={http://infoscience.epfl.ch/record/282923}, + venue_short={OSDI '20}} + + @article{Aguilera:287833, + title={Microsecond Consensus for Microsecond Applications}, + author={Aguilera, Marcos K. and Ben-David, Naama and Guerraoui, Rachid and Marathe, Virendra J. and Xygkis, Athanasios and Zablotchi, Igor}, + abstract={We consider the problem of making apps fault-tolerant through replication, when apps operate at the microsecond scale, as in finance, embedded computing, and microservices apps. These apps need a replication scheme that also operates at the microsecond scale, otherwise replication becomes a burden. We propose Mu, a system that takes less than 1.3 microseconds to replicate a (small) request in memory, and less than a millisecond to fail-over the system-this cuts the replication and fail-over latencies of the prior systems by at least 61% and 90%. Mu implements bona fide state machine replication/consensus (SMR) with strong consistency for a generic app, but it really shines on microsecond apps, where even the smallest overhead is significant. To provide this performance, Mu introduces a new SMR protocol that carefully leverages RDMA. Roughly, in Mu a leader replicates a request by simply writing it directly to the log of other replicas using RDMA, without any additional communication. Doing so, however, introduces the challenge of handling concurrent leaders, changing leaders, garbage collecting the logs, and more-challenges that we address in this paper through a judicious combination of RDMA permissions and distributed algorithmic design. We implemented Mu and used it to replicate several systems: a financial exchange app called Liquibook, Redis, Memcached, and HERD [33]. Our evaluation shows that Mu incurs a small replication latency, in some cases being the only viable replication system that incurs an acceptable overhead.}, + url={http://infoscience.epfl.ch/record/287833}, + venue_short={OSDI '20}} + + @article{Anadiotis:282822, + title={A System Design for Elastically Scaling Transaction Processing Engines in Virtualized Servers}, + author={Anadiotis, Angelos-Christos and Appuswamy, Raja and Ailamaki, Anastasia and Bronshtein, Ilan and Avni, Hillel and Dominguez-Sal, David and Goikhman, Shay and Levy, Eliezer}, + abstract={Online Transaction Processing (OLTP) deployments are migrating from on-premise to cloud settings in order to exploit the elasticity of cloud infrastructure which allows them to adapt to workload variations. However, cloud adaptation comes at the cost of redesigning the engine, which has led to the introduction of several, new, cloud-based transaction processing systems mainly focusing on: (i) the transaction coordination protocol, (ii) the data partitioning strategy, and, (iii) the resource isolation across multiple tenants.
As a result, standalone OLTP engines cannot be easily deployed with an elastic setting in the cloud and they need to migrate to another, specialized deployment.}, + url={http://infoscience.epfl.ch/record/282822}, + venue_short={VLDB '20}} + + +@article{KokorisKogias:287444, + title={CALYPSO: Private Data Management for Decentralized Ledgers}, + author={Kokoris Kogias, Eleftherios and Alp, Enis Ceyhun and Gasser, Linus and Jovanovic, Philipp Svetolik and Syta, Ewa and Ford, Bryan Alexander}, + abstract={Distributed ledgers provide high availability and integrity, making them a key enabler for practical and secure computation of distributed workloads among mutually distrustful parties. Many practical applications also require strong confidentiality, however. This work enhances permissioned and permissionless blockchains with the ability to manage confidential data without forfeiting availability or decentralization. The proposed Calypso architecture addresses two orthogonal challenges confronting modern distributed ledgers: (a) enabling the auditable management of secrets and (b) protecting distributed computations against arbitrage attacks when their results depend on the ordering and secrecy of inputs. Calypso introduces on-chain secrets, a novel abstraction that enforces atomic deposition of an auditable trace whenever users access confidential data. Calypso provides user-controlled consent management that ensures revocation atomicity and accountable anonymity. To enable permissionless deployment, we introduce an incentive scheme and provide users with the option to select their preferred trustees. We evaluated our Calypso prototype with a confidential document-sharing application and a decentralized lottery. Our benchmarks show that transaction-processing latency increases linearly in terms of security (number of trustees) and is in the range of 0.2 to 8 seconds for 16 to 128 trustees.}, + url={http://infoscience.epfl.ch/record/287444}, + venue_short={PVLDB '20}} + + +@article{Giannakopoulou:286040, + title={Cleaning Denial Constraint Violations through Relaxation}, + author={Giannakopoulou, Stella and Karpathiotakis, Manos and Ailamaki, Anastasia}, + abstract={Data cleaning is a time-consuming process that depends on the data analysis that users perform. Existing solutions treat data cleaning as a separate offline process that takes place before analysis begins. Applying data cleaning before analysis assumes a priori knowledge of the inconsistencies and the query workload, thereby requiring effort on understanding and cleaning the data that is unnecessary for the analysis. We propose an approach that performs probabilistic repair of denial constraint violations on-demand, driven by the exploratory analysis that users perform. We introduce Daisy, a system that seamlessly integrates data cleaning into the analysis by relaxing query results. Daisy executes analytical query-workloads over dirty data by weaving cleaning operators into the query plan. 
Our evaluation shows that Daisy adapts to the workload and outperforms traditional offline cleaning on both synthetic and real-world workloads.}, + url={http://infoscience.epfl.ch/record/286040}, + venue_short={SIGMOD '20}} + + +@article{Raza:286055, + title={Adaptive HTAP through Elastic Resource Scheduling}, + author={Raza, Aunn and Chrysogelos, Periklis and Anadiotis, Angelos Christos and Ailamaki, Anastasia}, + abstract={Modern Hybrid Transactional/Analytical Processing (HTAP) systems use an integrated data processing engine that performs analytics on fresh data, which are ingested from a transactional engine. HTAP systems typically consider data freshness at design time, and are optimized for a fixed range of freshness requirements, addressed at a performance cost for either OLTP or OLAP. The data freshness and the performance requirements of both engines, however, may vary with the workload. We approach HTAP as a scheduling problem, addressed at runtime through elastic resource management. We model an HTAP system as a set of three individual engines: an OLTP, an OLAP and a Resource and Data Exchange (RDE) engine. We devise a scheduling algorithm which traverses the HTAP design spectrum through elastic resource management, to meet the workload data freshness requirements. We propose an in-memory system design which is non-intrusive to the current state-of-art OLTP and OLAP engines, and we use it to evaluate the performance of our approach. Our evaluation shows that the performance benefit of our system for OLAP queries increases over time, reaching up to 50% compared to static schedules for 100 query sequences, while maintaining a small, and controlled, drop in the OLTP throughput.}, + url={http://infoscience.epfl.ch/record/286055}, + venue_short={SIGMOD '20}} + + +@article{Dinesh:284290, + title={RetroWrite: Statically Instrumenting COTS Binaries for Fuzzing and Sanitization}, + author={Dinesh, Sushant and Burow, Nathan and Xu, Dongyan and Payer, Mathias}, + abstract={Analyzing the security of closed source binaries is currently impractical for end-users, or even developers who rely on third-party libraries. Such analysis relies on automatic vulnerability discovery techniques, most notably fuzzing with sanitizers enabled. The current state of the art for applying fuzzing or sanitization to binaries is dynamic binary translation, which has prohibitive performance overhead. The alternate technique, static binary rewriting, cannot fully recover symbolization information and hence has difficulty modifying binaries to track code coverage for fuzzing or to add security checks for sanitizers.}, + url={http://infoscience.epfl.ch/record/284290}, + venue_short={IEEE Security and Privacy '20}} + + @article{Oh:276919, - title = {Linebacker: Preserving Victim Cache Lines in Idle Register Files of GPUs}, - author = {Oh, Yunho and Koo, Gunjae and Annavaram, Murali and Ro, Won Woo}, - publisher = {ASSOC COMPUTING MACHINERY}, - journal = {Proceedings Of The 2019 46Th International Symposium On Computer Architecture (ISCA'19)}, - address = {New York}, - pages = {183-196}, - venue_short = {ISCA '19}, - year = {2019}, - abstract = {Modern GPUs suffer from cache contention due to the limited cache size that is shared across tens of concurrently running warps. To increase the per-warp cache size prior techniques proposed warp throttling which limits the number of active warps. Warp throttling leaves several registers to be dynamically unused whenever a warp is throttled. 
Given the stringent cache size limitation in GPUs this work proposes a new cache management technique named Linebacker (LB) that improves GPU performance by utilizing idle register file space as victim cache space. Whenever a CTA becomes inactive, linebacker backs up the registers of the throttled CTA to the off-chip memory. Then, linebacker utilizes the corresponding register file space as victim cache space. If any load instruction finds data in the victim cache line, the data is directly copied to the destination register through a simple register-register move operation. To further improve the efficiency of victim cache linebacker allocates victim cache space only to a select few load instructions that exhibit high data locality. Through a careful design of victim cache indexing and management scheme linebacker provides 29.0% of speedup compared to the previously proposed warp throttling techniques.}, - url = {http://infoscience.epfl.ch/record/276919}, - doi = {10.1145/3307650.3322222}, -} -@article{Gupta:270293, - title = {Distributed Logless Atomic Durability with Persistent Memory}, - author = {Gupta, Siddharth and Daglis, Alexandros and Falsafi, Babak}, - journal = {Proceedings of the 52nd International Symposium on Microarchitecture (MICRO'19)}, - pages = {13}, - venue_short = {MICRO '19}, - year = {2019}, - abstract = {Datacenter operators have started deploying Persistent Memory (PM), leveraging its combination of fast access and persistence for significant performance gains. A key challenge for PM-aware software is to maintain high performance while achieving atomic durability. The latter typically requires the use of logging, which introduces considerable overhead with additional CPU cycles, write traffic, and ordering requirements. In this paper, we exploit the data multiversioning inherent in the memory hierarchy to achieve atomic durability without logging. Our design, LAD, relies on persistent buffering space at the memory controllers (MCs)—already present in modern CPUs—to speculatively accumulate all of a transaction’s updates before they are all atomically committed to PM. LAD employs an on-chip distributed commit protocol in hardware to manage the distributed speculative state each transaction accumulates across multiple MCs. We demonstrate that LAD is a practical design relying on modest hardware modifications to provide atomically durable transactions, while delivering up to 80% of ideal—i.e., PM-oblivious software’s—performance.}, - url = {http://infoscience.epfl.ch/record/270293}, - doi = {10.1145/3352460.3358321}, -} + title={Linebacker: Preserving Victim Cache Lines in Idle Register Files of GPUs}, + author={Oh, Yunho and Koo, Gunjae and Annavaram, Murali and Ro, Won Woo}, + abstract={Modern GPUs suffer from cache contention due to the limited cache size that is shared across tens of concurrently running warps. To increase the per-warp cache size prior techniques proposed warp throttling which limits the number of active warps. Warp throttling leaves several registers to be dynamically unused whenever a warp is throttled. Given the stringent cache size limitation in GPUs this work proposes a new cache management technique named Linebacker (LB) that improves GPU performance by utilizing idle register file space as victim cache space. Whenever a CTA becomes inactive, linebacker backs up the registers of the throttled CTA to the off-chip memory. Then, linebacker utilizes the corresponding register file space as victim cache space. 
If any load instruction finds data in the victim cache line, the data is directly copied to the destination register through a simple register-register move operation. To further improve the efficiency of victim cache linebacker allocates victim cache space only to a select few load instructions that exhibit high data locality. Through a careful design of victim cache indexing and management scheme linebacker provides 29.0% of speedup compared to the previously proposed warp throttling techniques.}, + url={http://infoscience.epfl.ch/record/276919}, + venue_short={ISCA '19}} + + @article{Shamis:273429, + title={Fast General Distributed Transactions with Opacity}, + author={Shamis, Alex and Renzelmann, Matthew and Novakovic, Stanko and Chatzopoulos, Georgios and Dragojevic, Aleksandar and Narayanan, Dushyanth and Castro, Miguel}, + abstract={Transactions can simplify distributed applications by hiding data distribution, concurrency, and failures from the application developer. Ideally the developer would see the abstraction of a single large machine that runs transactions sequentially and never fails. This requires the transactional subsystem to provide opacity (strict serializability for both committed and aborted transactions), as well as transparent fault tolerance with high availability. As even the best abstractions are unlikely to be used if they perform poorly, the system must also provide high performance. Existing distributed transactional designs either weaken this abstraction or are not designed for the best performance within a data center. This paper extends the design of FaRM - which provides strict serializability only for committed transactions - to provide opacity while maintaining FaRM's high throughput, low latency, and high availability within a modern data center. It uses timestamp ordering based on real time with clocks synchronized to within tens of microseconds across a cluster, and a failover protocol to ensure correctness across clock master failures. FaRM with opacity can commit 5.4 million new-order transactions per second when running the TPC-C transaction mix on 90 machines with 3-way replication.}, + url={http://infoscience.epfl.ch/record/273429}, + venue_short={SIGMOD '19}} + + @article{Cohen:263802, + title={Fine-Grain Checkpointing with In-Cache-Line Logging}, + author={Cohen, Nachshon and Aksun, David Teksen and Larus, James}, + abstract={Non-Volatile Memory offers the possibility of implementing high-performance, durable data structures. However, achieving performance comparable to well-designed data structures in non-persistent (transient) memory is difficult, primarily because of the cost of ensuring the order in which memory writes reach NVM. Often, this requires flushing data to NVM and waiting a full memory round-trip time. In this paper, we introduce two new techniques: Fine-Grained Checkpointing, which ensures a consistent, quickly recoverable data structure in NVM after a system failure, and In-Cache-Line Logging, an undo-logging technique that enables recovery of earlier state without requiring cache-line flushes in the normal case.
We implemented these techniques in the Masstree data structure, making it persistent and demonstrating the ease of applying them to a highly optimized system and their low (5.9-15.4%) runtime overhead cost.}, + url={http://infoscience.epfl.ch/record/263802}, + venue_short={ASPLOS '19}} + + @article{Daglis:265809, + title={RPCValet: NI-Driven Tail-Aware Balancing of µs-Scale RPCs}, + author={Daglis, Alexandros and Sutherland, Mark and Falsafi, Babak}, + abstract={Modern online services come with stringent quality requirements in terms of response time tail latency. Because of their decomposition into fine-grained communicating software layers, a single user request fans out into a plethora of short, μs-scale RPCs, aggravating the need for faster inter-server communication. In reaction to that need, we are witnessing a technological transition characterized by the emergence of hardware-terminated user-level protocols (e.g., InfiniBand/RDMA) and new architectures with fully integrated Network Interfaces (NIs). Such architectures offer a unique opportunity for a new NI-driven approach to balancing RPCs among the cores of manycore server CPUs, yielding major tail latency improvements for μs-scale RPCs. We introduce RPCValet, an NI-driven RPC load-balancing design for architectures with hardware-terminated protocols and integrated NIs, that delivers near-optimal tail latency. RPCValet's RPC dispatch decisions emulate the theoretically optimal single-queue system, without incurring synchronization overheads currently associated with single-queue implementations. Our design improves throughput under tight tail latency goals by up to 1.4x, and reduces tail latency before saturation by up to 4x for RPCs with μs-scale service times, as compared to current systems with hardware support for RPC load distribution. RPCValet performs within 15% of the theoretically optimal single-queue system.}, + url={http://infoscience.epfl.ch/record/265809}, + venue_short={ASPLOS '19}} + + @article{Bhattacharyya:270782, + title={SMoTherSpectre: Exploiting Speculative Execution through Port Contention}, + author={Bhattacharyya, Atri and Sandulescu, Alexandra and Neugschwandtner, Matthias and Sorniotti, Alessandro and Falsafi, Babak and Payer, Mathias Josef and Kurmus, Anil}, + abstract={Spectre, Meltdown, and related attacks have demonstrated that kernels, hypervisors, trusted execution environments, and browsers are prone to information disclosure through micro-architectural weaknesses. However, it remains unclear as to what extent other applications, in particular those that do not load attacker-provided code, may be impacted. It also remains unclear as to what extent these attacks are reliant on cache-based side channels. We introduce SMoTherSpectre, a speculative code-reuse attack that leverages port-contention in simultaneously multi-threaded processors (SMoTher) as a side channel to leak information from a victim process. SMoTher is a fine-grained side channel that detects contention based on a single victim instruction. To discover real-world gadgets, we describe a methodology and build a tool that locates SMoTher-gadgets in popular libraries. In an evaluation on glibc, we found hundreds of gadgets that can be used to leak information.
Finally, we demonstrate proof-of-concept attacks against the OpenSSH server, creating oracles for determining four host key bits, and against an application performing encryption using the OpenSSL library, creating an oracle which can differentiate a bit of the plaintext through gadgets in libcrypto and glibc.}, + url={http://infoscience.epfl.ch/record/270782}, + venue_short={CCS '19}} + + +@article{Srivastava:276976, + title={FirmFuzz: Automated IoT Firmware Introspection and Analysis}, + author={Srivastava, Prashast and Peng, Hui and Li, Jiahao and Okhravi, Hamed and Shrobe, Howard and Payer, Mathias}, + abstract={While the number of IoT devices grows at an exhilarating pace their security remains stagnant. Imposing secure coding standards across all vendors is infeasible. Testing individual devices allows an analyst to evaluate their security post deployment. Any discovered vulnerabilities can then be disclosed to the vendors in order to assist them in securing their products. The search for vulnerabilities should ideally be automated for efficiency and furthermore be device-independent for scalability.}, + url={http://infoscience.epfl.ch/record/276976}, + venue_short={CCS '19}} + + @article{Margaritov:276507, - title = {Prefetched Address Translation}, - author = {Margaritov, Artemiy and Ustiugov, Dmitrii and Bugnion, Edouard and Grot, Boris}, - publisher = {ASSOC COMPUTING MACHINERY}, - journal = {Proceedings of the 52nd International Symposium on Microarchitecture (MICRO'19)}, - address = {New York}, - pages = {1023-1036}, - venue_short = {MICRO '19}, - year = {2019}, - abstract = {With explosive growth in dataset sizes and increasing machine memory capacities, per-application memory footprints are commonly reaching into hundreds of GBs. Such huge datasets pressure the TLB, resulting in frequent misses that must be resolved through a page walk - a long-latency pointer chase through multiple levels of the in-memory radix tree-based page table.}, - url = {http://infoscience.epfl.ch/record/276507}, - doi = {10.1145/3352460.3358294}, -} + title={Prefetched Address Translation}, + author={Margaritov, Artemiy and Ustiugov, Dmitrii and Bugnion, Edouard and Grot, Boris}, + abstract={With explosive growth in dataset sizes and increasing machine memory capacities, per-application memory footprints are commonly reaching into hundreds of GBs. Such huge datasets pressure the TLB, resulting in frequent misses that must be resolved through a page walk - a long-latency pointer chase through multiple levels of the in-memory radix tree-based page table.}, + url={http://infoscience.epfl.ch/record/276507}, + venue_short={MICRO '19}} + + +@article{Gupta:270293, + title={Distributed Logless Atomic Durability with Persistent Memory}, + author={Gupta, Siddharth and Daglis, Alexandros and Falsafi, Babak}, + abstract={Datacenter operators have started deploying Persistent Memory (PM), leveraging its combination of fast access and persistence for significant performance gains. A key challenge for PM-aware software is to maintain high performance while achieving atomic durability. The latter typically requires the use of logging, which introduces considerable overhead with additional CPU cycles, write traffic, and ordering requirements. In this paper, we exploit the data multiversioning inherent in the memory hierarchy to achieve atomic durability without logging. 
Our design, LAD, relies on persistent buffering space at the memory controllers (MCs)—already present in modern CPUs—to speculatively accumulate all of a transaction’s updates before they are all atomically committed to PM. LAD employs an on-chip distributed commit protocol in hardware to manage the distributed speculative state each transaction accumulates across multiple MCs. We demonstrate that LAD is a practical design relying on modest hardware modifications to provide atomically durable transactions, while delivering up to 80% of ideal—i.e., PM-oblivious software’s—performance.}, + url={http://infoscience.epfl.ch/record/270293}, + venue_short={MICRO '19}} + + @article{Iyer:264778, - title = {Performance Contracts for Software Network Functions}, - author = {Iyer, Rishabh Ramesh and Pedrosa, Luis David and Zaostrovnykh, Arseniy and Pirelli, Solal and Argyraki, Katerina and Candea, George}, - publisher = {USENIX ASSOC}, - journal = {Proceedings of the ACM Symposium on Networked Systems Design and Implementation (NSDI'19)}, - address = {Berkeley}, - pages = {517-530}, - venue_short = {NSDI '19}, - year = {2019}, - abstract = {Software network functions (NFs), or middleboxes, promise flexibility and easy deployment of network services but face the serious challenge of unexpected performance behaviour. We propose the notion of a performance contract, a construct formulated in terms of performance critical variables, that provides a precise description of NF performance. Performance contracts enable fine-grained prediction and scrutiny of NF performance for arbitrary workloads, without having to run the NF itself. We describe BOLT, a technique and tool for computing such performance contracts for the entire software stack of NFs written in C, including the core NF logic, DPDK packet processing framework, and NIC driver. BOLT takes as input the NF implementation code and outputs the corresponding contract. Under the covers, it combines pre-analysis of a library of stateful NF data structures with automated symbolic execution of the NF’s code. We evaluate BOLT on four NFs—a Maglev-like load balancer, a NAT, an LPM router, and a MAC bridge—and show that its performance contracts predict the dynamic instruction count and memory access count with a maximum gap of 7% between the real execution and the conservatively predicted upper bound. With further engineering, this gap can be reduced.}, - url = {http://infoscience.epfl.ch/record/264778}, -} -@article{Shamis:273429, - title = {Fast General Distributed Transactions with Opacity}, - author = {Shamis, Alex and Renzelmann, Matthew and Novakovic, Stanko and Chatzopoulos, Georgios and Dragojevic, Aleksandar and Narayanan, Dushyanth and Castro, Miguel}, - publisher = {ASSOC COMPUTING MACHINERY}, - journal = {Proceedings Of The 2019 International Conference On Management Of Data (SIGMOD'19)}, - address = {New York}, - series = {International Conference on Management of Data}, - pages = {433-448}, - venue_short = {SIGMOD '19}, - year = {2019}, - abstract = {Transactions can simplify distributed applications by hiding data distribution, concurrency, and failures from the application developer. Ideally the developer would see the abstraction of a single large machine that runs transactions sequentially and never fails. This requires the transactional subsystem to provide opacity (strict serializability for both committed and aborted transactions), as well as transparent fault tolerance with high availability. 
As even the best abstractions are unlikely to be used if they perform poorly, the system must also provide high performance. Existing distributed transactional designs either weaken this abstraction or are not designed for the best performance within a data center. This paper extends the design of FaRM - which provides strict serializability only for committed transactions - to provide opacity while maintaining FaRM's high throughput, low latency, and high availability within a modern data center. It uses timestamp ordering based on real time with clocks synchronized to within tens of microseconds across a cluster, and a failover protocol to ensure correctness across clock master failures. FaRM with opacity can commit 5.4 million neworder transactions per second when running the TPC-C transaction mix on 90 machines with 3-way replication.}, - url = {http://infoscience.epfl.ch/record/273429}, - doi = {10.1145/3299869.3300069}, -} + title={Performance Contracts for Software Network Functions}, + author={Iyer, Rishabh Ramesh and Pedrosa, Luis David and Zaostrovnykh, Arseniy and Pirelli, Solal and Argyraki, Katerina and Candea, George}, + abstract={Software network functions (NFs), or middleboxes, promise flexibility and easy deployment of network services but face the serious challenge of unexpected performance behaviour. We propose the notion of a performance contract, a construct formulated in terms of performance critical variables, that provides a precise description of NF performance. Performance contracts enable fine-grained prediction and scrutiny of NF performance for arbitrary workloads, without having to run the NF itself. We describe BOLT, a technique and tool for computing such performance contracts for the entire software stack of NFs written in C, including the core NF logic, DPDK packet processing framework, and NIC driver. BOLT takes as input the NF implementation code and outputs the corresponding contract. Under the covers, it combines pre-analysis of a library of stateful NF data structures with automated symbolic execution of the NF’s code. We evaluate BOLT on four NFs—a Maglev-like load balancer, a NAT, an LPM router, and a MAC bridge—and show that its performance contracts predict the dynamic instruction count and memory access count with a maximum gap of 7% between the real execution and the conservatively predicted upper bound. With further engineering, this gap can be reduced.}, + url={http://infoscience.epfl.ch/record/264778}, + venue_short={NSDI '19}} + + +@article{Chrysogelos:262531, + title={HetExchange: Encapsulating heterogeneous CPU-GPU parallelism in JIT compiled engines}, + author={Chrysogelos, Periklis and Karpathiotakis, Manos and Appuswamy, Raja and Ailamaki, Anastasia}, + abstract={Modern server hardware is increasingly heterogeneous as hardware accelerators, such as GPUs, are used together with multicore CPUs to meet the computational demands of modern data analytics workloads. Unfortunately, query parallelization techniques used by analytical database engines are designed for homogeneous multicore servers, where query plans are parallelized across CPUs to process data stored in cache coherent shared memory. Thus, these techniques are unable to fully exploit available heterogeneous hardware, where one needs to exploit task-parallelism of CPUs and data-parallelism of GPUs for processing data stored in a deep, non-cache-coherent memory hierarchy with widely varying access latencies and bandwidth. 
In this paper, we introduce HetExchange–a parallel query execution framework that encapsulates the heterogeneous parallelism of modern multi-CPU–multi-GPU servers and enables the parallelization of (pre-)existing sequential relational operators. In contrast to the interpreted nature of traditional Exchange, HetExchange is designed to be used in conjunction with JIT compiled engines in order to allow a tight integration with the proposed operators and generation of efficient code for heterogeneous hardware. We validate the applicability and efficiency of our design by building a prototype that can operate over both CPUs and GPUs, and enables its operators to be parallelism- and data-location-agnostic. In doing so, we show that efficiently exploiting CPU–GPU parallelism can provide 2.8x and 6.4x improvement in performance compared to state-of-the-art CPU-based and GPU-based DBMS.}, + url={http://infoscience.epfl.ch/record/262531}, + venue_short={PVLDB '19}} + + +@article{Nikolopoulos:265801, + title={Retroactive Packet Sampling for Traffic Receipts}, + author={Nikolopoulos, Pavlos and Pappas, Christos and Argyraki, Katerina and Perrig, Adrian}, + abstract={Is it possible to design a packet-sampling algorithm that prevents the network node that performs the sampling from treating the sampled packets preferentially? We study this problem in the context of designing a "network transparency" system. In this system, networks emit receipts for a small sample of the packets they observe, and a monitor collects these receipts to estimate each network's loss and delay performance. Sampling is a good building block for this system, because it enables a solution that is flexible and combines low resource cost with quantifiable accuracy. The challenge is cheating resistance: when a network's performance is assessed based on the conditions experienced by a small traffic sample, the network has a strong incentive to treat the sampled packets better than the rest. We contribute a sampling algorithm that is provably robust to such prioritization attacks, enables network performance estimation with quantifiable accuracy, and requires minimal resources. We confirm our analysis using real traffic traces.}, + url={http://infoscience.epfl.ch/record/265801}, + venue_short={SIGMETRICS '19}} + + @article{Zastrovnykh:275534, - title = {Verifying Software Network Functions with No Verification Expertise}, - author = {Zastrovnykh, Arseniy and Pirelli, Solal and Iyer, Rishabh and Rizzo, Matteo and Pedrosa, Luis and Argyraki, Katerina and Candea, George}, - publisher = {ACM}, - journal = {Proceedings of the 27th ACM Symposium on Operating Systems Principles (SOSP'19)}, - pages = {16. 275-290}, - venue_short = {SOSP '19}, - year = {2019}, - abstract = {We present the design and implementation of Vigor, a software stack and toolchain for building and running software network middleboxes that are guaranteed to be correct, while preserving competitive performance and developer productivity. Developers write the core of the middlebox---the network function (NF)---in C, on top of a standard packet-processing framework, putting persistent state in data structures from Vigor's library; the Vigor toolchain then automatically verifies that the resulting software stack correctly implements a specification, which is written in Python. 
Vigor has three key features: network function developers need no verification expertise, and the verification process does not require their assistance (push-button verification); the entire software stack is verified, down to the hardware (full-stack verification); and verification can be done in a pay-as-you-go manner, i.e., instead of investing upfront a lot of time in writing and verifying a complete specification, one can specify one-off properties in a few lines of Python and verify them without concern for the rest. We developed five representative NFs---a NAT, a Maglev load balancer, a MAC-learning bridge, a firewall, and a traffic policer---and verified with Vigor that they satisfy standards-derived specifications, are memory-safe, and do not crash or hang. We show that they provide competitive performance. The Vigor framework is available at [http://vigor.epfl.ch].}, - url = {http://infoscience.epfl.ch/record/275534}, - doi = {10.1145/3341301.3359647}, -} -@article{Tsai:275441, - title = {Pythia: Remote Oracles for the Masses}, - author = {Tsai, Shin-Yeh and Payer, Mathias and Zhang, Yiying}, - publisher = {USENIX ASSOC}, - journal = {Proceedings Of The 28th Usenix Security Symposium}, - address = {Berkeley}, - pages = {693-710}, - venue_short = {USENIX Security '19}, - year = {2019}, - abstract = {Remote Direct Memory Access (RDMA) is a technology that allows direct access from the network to a machine's main memory without involving its CPU. RDMA offers low-latency, high-bandwidth performance and low CPU utilization. While RDMA provides massive performance boosts and has thus been adopted by several major cloud providers, security concerns have so far been neglected.}, - url = {http://infoscience.epfl.ch/record/275441}, -} + title={Verifying Software Network Functions with No Verification Expertise}, + author={Zaostrovnykh, Arseniy and Pirelli, Solal and Iyer, Rishabh and Rizzo, Matteo and Pedrosa, Luis and Argyraki, Katerina and Candea, George}, + abstract={We present the design and implementation of Vigor, a software stack and toolchain for building and running software network middleboxes that are guaranteed to be correct, while preserving competitive performance and developer productivity. Developers write the core of the middlebox---the network function (NF)---in C, on top of a standard packet-processing framework, putting persistent state in data structures from Vigor's library; the Vigor toolchain then automatically verifies that the resulting software stack correctly implements a specification, which is written in Python. Vigor has three key features: network function developers need no verification expertise, and the verification process does not require their assistance (push-button verification); the entire software stack is verified, down to the hardware (full-stack verification); and verification can be done in a pay-as-you-go manner, i.e., instead of investing upfront a lot of time in writing and verifying a complete specification, one can specify one-off properties in a few lines of Python and verify them without concern for the rest. We developed five representative NFs---a NAT, a Maglev load balancer, a MAC-learning bridge, a firewall, and a traffic policer---and verified with Vigor that they satisfy standards-derived specifications, are memory-safe, and do not crash or hang. We show that they provide competitive performance.
+The Vigor framework is available at http://vigor.epfl.ch.},
+ url={http://infoscience.epfl.ch/record/275534},
+ venue_short={SOSP '19}}
+
+
+@article{Burow:275478,
+ title={SoK: Shining Light on Shadow Stacks},
+ author={Burow, Nathan and Zhang, Xinping and Payer, Mathias},
+ abstract={Control-Flow Hijacking attacks are the dominant attack vector against C/C++ programs. Control-Flow Integrity (CFI) solutions mitigate these attacks on the forward edge, i.e., indirect calls through function pointers and virtual calls. Protecting the backward edge is left to stack canaries, which are easily bypassed through information leaks. Shadow Stacks are a fully precise mechanism for protecting backwards edges, and should be deployed with CFI mitigations.},
+ url={http://infoscience.epfl.ch/record/275478},
+ venue_short={IEEE Security and Privacy '19}}
+
+
+@article{Drijvers:275481,
+ title={On the Security of Two-Round Multi-Signatures},
+ author={Drijvers, Manu and Edalatnejad, Kasra and Ford, Bryan and Kiltz, Eike and Loss, Julian and Neven, Gregory and Stepanovs, Igors},
+ abstract={A multi-signature scheme allows a group of signers to collaboratively sign a message, creating a single signature that convinces a verifier that every individual signer approved the message. The increased interest in technologies to decentralize trust has triggered the proposal of highly efficient two-round Schnorr-based multi-signature schemes designed to scale up to thousands of signers, namely BCJ by Bagherzandi et al. (CCS 2008), MWLD by Ma et al. (DCC 2010), CoSi by Syta et al. (S&P 2016), and MuSig by Maxwell et al. (ePrint 2018). In this work, we point out serious security issues in all currently known two-round multi-signature schemes (without pairings). First, we prove that none of the schemes can be proved secure without radically departing from currently known techniques. Namely, we show that if the one-more discrete-logarithm problem is hard, then no algebraic reduction exists that proves any of these schemes secure under the discrete-logarithm or one-more discrete-logarithm problem. We point out subtle flaws in the published security proofs of the above schemes (except CoSi, which was not proved secure) to clarify the contradiction between our result and the existing proofs. Next, we describe practical sub-exponential attacks on all schemes, providing further evidence of their insecurity. Being left without secure two-round multi-signature schemes, we present mBCJ, a variant of the BCJ scheme that we prove secure under the discrete-logarithm assumption in the random-oracle model.
+Our experiments show that mBCJ barely affects scalability compared to CoSi, allowing 16384 signers to collaboratively sign a message in about 2 seconds, making it a highly practical and provably secure alternative for large-scale deployments.},
+ url={http://infoscience.epfl.ch/record/275481},
+ venue_short={IEEE Security and Privacy '19}}
+
+
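+%% Illustrative template for new entries (not a real publication; all field
+%% values are placeholders). dblpbibcloud.py matches local entries against
+%% DBLP by normalized title and reads the title, author, abstract, url,
+%% year, and venue_short fields, so a minimal entry needs at least:
+%%
+%% @article{Lastname:000000,
+%%   title={...},
+%%   author={Lastname, Firstname and ...},
+%%   abstract={...},
+%%   url={...},
+%%   year={2021},
+%%   venue_short={VENUE '21}}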
diff --git a/dblpbibcloud.py b/dblpbibcloud.py
new file mode 100755
index 0000000..577bce5
--- /dev/null
+++ b/dblpbibcloud.py
@@ -0,0 +1,283 @@
+#!/usr/bin/python3
+# -*- coding: latin-1 -*-
+
+####
+#### dblpbibcloud.py
+#### based on bibcloud.py
+####
+
+# Copyright 2015-21 Ecole Polytechnique Federale Lausanne (EPFL)
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+#
+# dblpbibcloud:
+# -- all input arguments are in the code itself
+# -- input: set of principal investigators (lab heads) with start date
+# -- input: set of conferences
+# -- input: exclusion list
+
+
+
+import sys
+import os
+import xml.etree.ElementTree as ET
+import subprocess
+import time
+import locale
+import requests
+import functools
+import bibtexparser
+import itertools
+
+DEBUG = 0
+
+AUTHORS = {
+    "bugnion":   {"dblp": "b/EBugnion"},
+    "larus":     {"dblp": "l/JamesRLarus"},
+    "candea":    {"dblp": "c/GeorgeCandea"},
+    "argyraki":  {"dblp": "71/6861"},
+    "kashyap":   {"dblp": "145/0912", "year": 2020},
+    "ailamaki":  {"dblp": "a/AnastassiaAilamaki"},
+    "guerraoui": {"dblp": "g/RachidGuerraoui"},
+    "kermarrec": {"dblp": "86/676"},
+    "falsafi":   {"dblp": "f/BabakFalsafi"},
+    "payer":     {"dblp": "31/1273"},
+    "ford":      {"dblp": "f/BryanFord"}
+}
+
+
+CONFERENCES_ORG = [
+    "ASPLOS",
+    "SOSP", "OSDI",
+    "SIGCOMM", "NSDI", "ISCA", "MICRO",
+    "SIGMOD", "VLDB", "PVLDB",
+    "MobiCom", "MobiSys", "SenSys", "IMC",
+    "SIGMETRICS",
+    "PLDI", "CCS", "SP", "USS",  # comma required: without it, "USS" and
+                                 # "Eurosys" concatenate into "USSEurosys"
+    #### not on George's original list
+    "Eurosys"
+]
+
+
+CONFERENCES = [x.upper() for x in CONFERENCES_ORG]
+DEBUG and print(CONFERENCES)
+
+
+CONFERENCES_PRETTY = {
+    "SP":  "IEEE Security and Privacy",
+    "USS": "USENIX Security"
+}
+
+
+############
+### globals
+############
+
+PUBLICATIONS = {}
+KEYPUBS = []
+BIBALL = {}
+
+
+#### added for dblpbibcloud
+
+def sort_lambda(a, b):
+    # cmp-style comparator: newest year first; within a year, by venue name
+    if a["year"] == b["year"]:
+        if a["conf"] < b["conf"]:
+            return -1
+        elif a["conf"] > b["conf"]:
+            return 1
+        else:
+            return 0
+    else:
+        return b["year"] - a["year"]
+
+
+def normalize_title(a):
+    # canonicalize a title for cross-source matching: strip all spaces,
+    # drop a single trailing period, and lower-case the result
+    x = a.find(" ")
+    if x == len(a) - 1:
+        return normalize_title(a[:x])
+    elif x >= 0:
+        return normalize_title(a[:x] + a[x+1:])
+    else:
+        dot = a[len(a)-1]
+        if dot == ".":
+            b = a[:len(a)-1]
+            return b.lower()
+        else:
+            return a.lower()
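+
+# Quick illustration of the normalization above (assumes non-empty titles;
+# the title strings here are examples, not entries from any bib file):
+# spacing, case, and a trailing period no longer prevent a match between a
+# DBLP title and a local bib title.
+if DEBUG:
+    assert normalize_title("Shining Light on Shadow Stacks.") == \
+        normalize_title("shining  light  on shadow stacks")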
+
+def print_bibentry(p, b):
+    # emit one entry of the generated bib/pubs.bib file
+    F.write("@article{" + b["ID"] + ",\n")
+    F.write(" title={" + b["title"] + "},\n")
+    F.write(" author={" + b["author"] + "},\n")
+    F.write(" abstract={" + b["abstract"] + "},\n")
+    F.write(" url={" + b["url"] + "},\n")
+    conf = p["conf"].upper()
+    if conf in CONFERENCES_PRETTY:
+        conf = CONFERENCES_PRETTY[conf]
+    if "venue_short" in b:
+        F.write(" venue_short={" + b["venue_short"] + "}}\n\n\n")
+    else:
+        F.write(" venue_short={" + conf + " '" + str(p["year"] - 2000) + "}}\n\n\n")
+
+
+###################################################
+#################### main #########################
+###################################################
+# process bib file from ARGV
+print("dblpbibcloud: This is dblpbibcloud ... Use at your own risk ... see source for documentation")
+
+
+if not os.path.exists(".bibcloud"):
+    os.mkdir(".bibcloud")
+
+for author in AUTHORS:
+    print("Fetching author", author, AUTHORS[author])
+    url = "https://dblp.org/pid/" + AUTHORS[author]["dblp"] + ".xml"
+    f = requests.get(url)
+    xml = ET.ElementTree(ET.fromstring(f.text))
+    root = xml.getroot()
+    for child in root:
+        if child.tag == "r":
+            for paper in child:
+                key = paper.attrib['key']
+                if key in PUBLICATIONS:
+                    PUBLICATIONS[key]["authors"].append(author)
+                else:
+                    PUBLICATIONS[key] = {"xml": paper, "authors": [author]}
+
+
+for p in PUBLICATIONS:
+    x = p.split("/")
+    xml = PUBLICATIONS[p]["xml"]
+    authors = PUBLICATIONS[p]["authors"]
+    if x[1].upper() in CONFERENCES:
+        year = 0
+        title = ""
+        for c in xml:
+            if c.tag == "year":
+                year = int(c.text)
+            if c.tag == "title":
+                title = c.text
+        year == 0 and sys.exit("could not find year for " + str(p))
+        title == "" and sys.exit("could not find title for " + str(p))
+        keep = 0
+        for a in authors:
+            minyear = 2019
+            if "year" in AUTHORS[a]:
+                minyear = AUTHORS[a]["year"]
+                DEBUG and print("adjusting min year for", a, minyear)
+            if year >= minyear:
+                keep = 1
+        if keep:
+            DEBUG and print("key", p, x[1], year, authors)
+            KEYPUBS.append({
+                "key": p,
+                "conf": x[1],
+                "year": year,
+                "authors": authors,
+                "title": normalize_title(title)
+            })
+
+KEYPUBS.sort(key=functools.cmp_to_key(sort_lambda))
+
+for x in KEYPUBS:
+    print("%4d %10s %30s %20s %s" % (x["year"], x["conf"], x["authors"], x["key"], x["title"]))
+
+
+## add lab-specific file here for those who don't use infoscience
+## (e.g., gannimo)
+for filename in ["bib/misc.bib"]:
+    with open(filename) as bibtex_file:
+        bib_db = bibtexparser.load(bibtex_file)
+    print("reading local bib file", len(bib_db.entries), "entries")
+    for x in bib_db.entries:
+        title = normalize_title(x["title"])
+        found = 0
+        for y in KEYPUBS:
+            if y["title"] == title:
+                print(" -- found", y["key"])
+                y["bib"] = x
+                found = 1
+        if found == 0:
+            print(" -- not found; inserted", x["venue_short"], x["year"])
+            KEYPUBS.append({"key": "NONE",
+                            "conf": x["venue_short"],
+                            "year": int(x["year"]),
+                            "authors": "NONE",
+                            "title": title,
+                            "bib": x})
+
+# sort again, now that the local entries have been merged in
+KEYPUBS.sort(key=functools.cmp_to_key(sort_lambda))
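+
+# Note: the cmp-style sort_lambda is equivalent to a key-based sort on the
+# tuple (-year, conf) -- newest first, then venue name. The original code
+# path is kept above; the same ordering could be expressed as (sketch):
+#   KEYPUBS.sort(key=lambda e: (-e["year"], e["conf"]))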
print("!!!! Duplicateentries for title (1/2)"+title) + print(BIBALL[title]) + print("!!!! Duplicate entries for title (2/2) "+title) + print(x) + else: + BIBALL[title] = x + + + + +F = open("bib/pubs.bib","w") +F.write("%% DO NOT EDIT\n") +F.write("%% Generated by dblpbibcloud.py\n") +for p in KEYPUBS: + key = p["key"] + title = p["title"] + if title in BIBALL: + DEBUG and print(" match for",key) + if "bib" in p: + # also found locally ... which should not be the case + print() + print() + print("REMOVE from bib/misc.pub",p["bib"]) + + if "abstract" in BIBALL[title]: + print_bibentry(p,BIBALL[title]) + else: + print("No abstract in infoscience for ",p,BIBALL[title]) + elif "bib" in p: + DEBUG and print("Found locally") + print_bibentry(p,p["bib"]) + else: + print("nomatch for",p) + +F.close() +sys.exit(0) + + +