diff --git a/Makefile b/Makefile index 0ea52ee..523a475 100644 --- a/Makefile +++ b/Makefile @@ -10,6 +10,7 @@ _includes/pubs.html: bib/pubs.bib bib/publications.tmpl $(BIBBLE) $+ > $@ build: _includes/pubs.html + ./dblpbibcloud.py jekyll build # you can configure these at the shell, e.g.: diff --git a/bib/misc.bib b/bib/misc.bib new file mode 100644 index 0000000..930dbfd --- /dev/null +++ b/bib/misc.bib @@ -0,0 +1,125 @@ +%% +%% This file is editable +%% order is unimportant + + +@article{Sirin:284817, + title = {Micro-architectural Analysis of In-memory OLTP: Revisited}, + author = {Sirin, Utku and Tözün, Pınar and Porobic, Danica and Yasin, Ahmad and Ailamaki, Anastasia}, + journal = {The VLDB Journal}, + number = {4}, + volume = {30}, + venue_short = {VLDBJ '21}, + year = {2021}, + abstract = {Micro-architectural behavior of traditional disk-based online transaction processing (OLTP) systems has been investigated extensively over the past couple of decades. Results show that traditional OLTP systems mostly under-utilize the available micro-architectural resources. In-memory OLTP systems, on the other hand, process all the data in main-memory and, therefore, can omit the buffer pool. Furthermore, they usually adopt more lightweight concurrency control mechanisms, cache-conscious data structures, and cleaner codebases since they are usually designed from scratch. Hence, we expect significant differences in micro-architectural behavior when running OLTP on platforms optimized for in-memory processing as opposed to disk-based database systems. In particular, we expect that in-memory systems exploit micro-architectural features such as instruction and data caches significantly better than disk-based systems. This paper sheds light on the micro-architectural behavior of in-memory database systems by analyzing and contrasting it to the behavior of disk-based systems when running OLTP workloads. The results show that, despite all the design changes, in-memory OLTP exhibits very similar micro-architectural behavior to disk-based OLTP: more than half of the execution time goes to memory stalls where instruction cache misses or the long-latency data misses from the last-level cache (LLC) are the dominant factors in the overall execution time. Even though ground-up designed in-memory systems can eliminate the instruction cache misses, the reduction in instruction stalls amplifies the impact of LLC data misses. As a result, only 30% of the CPU cycles are used to retire instructions, and 70% of the CPU cycles are wasted to stalls for both traditional disk-based and new generation in-memory OLTP.}, + url = {http://infoscience.epfl.ch/record/284817}, + doi = {10.1007/s00778-021-00663-8}, +} + +@article{Dauterman:287869, + title = {SafetyPin: Encrypted Backups with Human-Memorable Secrets}, + author = {Dauterman, Emma and Corrigan-Gibbs, Henry and Mazieres, David}, + publisher = {USENIX ASSOC}, + journal = {Proceedings Of The 14th Usenix Symposium On Operating Systems Design And Implementation (OSDI'20)}, + address = {Berkeley}, + pages = {1121-1138}, + venue_short = {OSDI '20}, + year = {2020}, + abstract = {We present the design and implementation of SafetyPin, a system for encrypted mobile-device backups. Like existing cloud-based mobile-backup systems, including those of Apple and Google, SafetyPin requires users to remember only a short PIN and defends against brute-force PIN-guessing attacks using hardware security protections. 
Unlike today's systems, SafetyPin splits trust over a cluster of hardware security modules (HSMs) in order to provide security guarantees that scale with the number of HSMs. In this way, SafetyPin protects backed-up user data even against an attacker that can adaptively compromise many of the system's constituent HSMs. SafetyPin provides this protection without sacrificing scalability or fault tolerance. Decentralizing trust while respecting the resource limits of today's HSMs requires a synthesis of systems-design principles and cryptographic tools. We evaluate SafetyPin on a cluster of 100 low-cost HSMs and show that a SafetyPin-protected recovery takes 1.01 seconds. To process 1B recoveries a year, we estimate that a SafetyPin deployment would need 3,100 low-cost HSMs.}, + url = {http://infoscience.epfl.ch/record/287869}, +} + + +@article{Cohn-Gordon:287859, + title = {DELF: Safeguarding Deletion Correctness in Online Social Networks}, + author = {Cohn-Gordon, Katriel and Damaskinos, Georgios and Neto, Divino and Cordova, Shi and Reitz, Benoit and Strahs, Benjamin and Obenshain, Daniel and Pearce, Paul and Papagiannis, Ioannis}, + publisher = {USENIX ASSOC}, + journal = {Proceedings Of The 29th Usenix Security Symposium}, + address = {Berkeley}, + pages = {1057-1074}, + venue_short = {USENIX Security '20}, + year = {2020}, + abstract = {Deletion is a core facet of Online Social Networks (OSNs). For users, deletion is a tool to remove what they have shared and control their data. For OSNs, robust deletion is both an obligation to their users and a risk when developer mistakes inevitably occur. While developers are effective at identifying high-level deletion requirements in products (e.g., users should be able to delete posted photos), they are less effective at mapping high-level requirements into concrete operations (e.g., deleting all relevant items in data stores). Without framework support, developer mistakes lead to violations of users' privacy, such as retaining data that should be deleted, deleting the wrong data, and exploitable vulnerabilities.}, + url = {http://infoscience.epfl.ch/record/287859}, +} + + +@article{Anadiotis:282822, + title = {A System Design for Elastically Scaling Transaction Processing Engines in Virtualized Servers}, + author = {Anadiotis, Angelos-Christos and Appuswamy, Raja and Ailamaki, Anastasia and Bronshtein, Ilan and Avni, Hillel and Dominguez-Sal, David and Goikhman, Shay and Levy, Eliezer}, + publisher = {ASSOC COMPUTING MACHINERY}, + journal = {Proceedings of the VLDB Endowment}, + address = {New York}, + number = {12}, + volume = {13}, + pages = {3085-3098}, + venue_short = {VLDB '20}, + year = {2020}, + abstract = {Online Transaction Processing (OLTP) deployments are migrating from on-premise to cloud settings in order to exploit the elasticity of cloud infrastructure which allows them to adapt to workload variations. However, cloud adaptation comes at the cost of redesigning the engine, which has led to the introduction of several, new, cloud-based transaction processing systems mainly focusing on: (i) the transaction coordination protocol, (ii) the data partitioning strategy, and, (iii) the resource isolation across multiple tenants. 
As a result, standalone OLTP engines cannot be easily deployed in an elastic setting in the cloud and they need to migrate to another, specialized deployment.}, + url = {http://infoscience.epfl.ch/record/282822}, + doi = {10.14778/3415478.3415536}, +} +@article{Sirin:280879, + title = {Micro-architectural Analysis of OLAP: Limitations and Opportunities}, + author = {Sirin, Utku and Ailamaki, Anastasia}, + publisher = {ASSOC COMPUTING MACHINERY}, + journal = {Proceedings of the VLDB Endowment}, + address = {New York}, + number = {6}, + volume = {13}, + pages = {840-853}, + venue_short = {VLDB '20}, + year = {2020}, + abstract = {Understanding micro-architectural behavior is important for efficiently using hardware resources. Recent work has shown that in-memory online transaction processing (OLTP) systems severely underutilize their core micro-architecture resources [29]. Whereas, online analytical processing (OLAP) workloads exhibit a completely different computing pattern. OLAP workloads are read-only, bandwidth-intensive, and include various data access patterns. With the rise of column-stores, they run on high-performance engines that are tightly optimized for modern hardware. Consequently, micro-architectural behavior of modern OLAP systems remains unclear.}, + url = {http://infoscience.epfl.ch/record/280879}, + doi = {10.14778/3380750.3380755}, +} +@article{Olma:275745, + title = {Adaptive Partitioning and Indexing for In-situ Query Processing}, + author = {Olma, Matthaios and Karpathiotakis, Manos and Alagiannis, Ioannis and Athanassoulis, Manos and Ailamaki, Anastasia}, + publisher = {SPRINGER}, + journal = {The VLDB Journal}, + address = {New York}, + number = {1}, + volume = {29}, + pages = {569-591}, + venue_short = {VLDBJ '20}, + year = {2020}, + abstract = {The constant flux of data and queries alike has been pushing the boundaries of data analysis systems. The increasing size of raw data files has made data loading an expensive operation that delays the data-to-insight time. To alleviate the loading cost, in situ query processing systems operate directly over raw data and offer instant access to data. At the same time, analytical workloads have an increasing number of queries. Typically, each query focuses on a constantly shifting, yet small, range. As a result, minimizing the workload latency requires the benefits of indexing in in situ query processing. In this paper, we present an online partitioning and indexing scheme, along with a partitioning and indexing tuner tailored for in situ querying engines. The proposed system design improves query execution time by taking into account user query patterns, to (i) partition raw data files logically and (ii) build lightweight partition-specific indexes for each partition. We build an in situ query engine called Slalom to showcase the impact of our design. Slalom employs adaptive partitioning and builds non-obtrusive indexes in different partitions on-the-fly based on lightweight query access pattern monitoring. As a result of its lightweight nature, Slalom achieves efficient query processing over raw data with minimal memory consumption. 
Our experimentation with both microbenchmarks and real-life workloads shows that Slalom outperforms state-of-the-art in situ engines and achieves comparable query response times with fully indexed DBMS, offering lower cumulative query execution times for query workloads with increasing size and unpredictable access patterns.}, + url = {http://infoscience.epfl.ch/record/275745}, + doi = {10.1007/s00778-019-00580-x}, +} + + +@article{Oh:276919, + title = {Linebacker: Preserving Victim Cache Lines in Idle Register Files of GPUs}, + author = {Oh, Yunho and Koo, Gunjae and Annavaram, Murali and Ro, Won Woo}, + publisher = {ASSOC COMPUTING MACHINERY}, + journal = {Proceedings Of The 2019 46Th International Symposium On Computer Architecture (ISCA'19)}, + address = {New York}, + pages = {183-196}, + venue_short = {ISCA '19}, + year = {2019}, + abstract = {Modern GPUs suffer from cache contention due to the limited cache size that is shared across tens of concurrently running warps. To increase the per-warp cache size prior techniques proposed warp throttling which limits the number of active warps. Warp throttling leaves several registers to be dynamically unused whenever a warp is throttled. Given the stringent cache size limitation in GPUs this work proposes a new cache management technique named Linebacker (LB) that improves GPU performance by utilizing idle register file space as victim cache space. Whenever a CTA becomes inactive, linebacker backs up the registers of the throttled CTA to the off-chip memory. Then, linebacker utilizes the corresponding register file space as victim cache space. If any load instruction finds data in the victim cache line, the data is directly copied to the destination register through a simple register-register move operation. To further improve the efficiency of victim cache linebacker allocates victim cache space only to a select few load instructions that exhibit high data locality. Through a careful design of victim cache indexing and management scheme linebacker provides 29.0% of speedup compared to the previously proposed warp throttling techniques.}, + url = {http://infoscience.epfl.ch/record/276919}, + doi = {10.1145/3307650.3322222}, +} + + + +@article{Shamis:273429, + title = {Fast General Distributed Transactions with Opacity}, + author = {Shamis, Alex and Renzelmann, Matthew and Novakovic, Stanko and Chatzopoulos, Georgios and Dragojevic, Aleksandar and Narayanan, Dushyanth and Castro, Miguel}, + publisher = {ASSOC COMPUTING MACHINERY}, + journal = {Proceedings Of The 2019 International Conference On Management Of Data (SIGMOD'19)}, + address = {New York}, + series = {International Conference on Management of Data}, + pages = {433-448}, + venue_short = {SIGMOD '19}, + year = {2019}, + abstract = {Transactions can simplify distributed applications by hiding data distribution, concurrency, and failures from the application developer. Ideally the developer would see the abstraction of a single large machine that runs transactions sequentially and never fails. This requires the transactional subsystem to provide opacity (strict serializability for both committed and aborted transactions), as well as transparent fault tolerance with high availability. As even the best abstractions are unlikely to be used if they perform poorly, the system must also provide high performance. Existing distributed transactional designs either weaken this abstraction or are not designed for the best performance within a data center. 
This paper extends the design of FaRM - which provides strict serializability only for committed transactions - to provide opacity while maintaining FaRM's high throughput, low latency, and high availability within a modern data center. It uses timestamp ordering based on real time with clocks synchronized to within tens of microseconds across a cluster, and a failover protocol to ensure correctness across clock master failures. FaRM with opacity can commit 5.4 million neworder transactions per second when running the TPC-C transaction mix on 90 machines with 3-way replication.}, + url = {http://infoscience.epfl.ch/record/273429}, + doi = {10.1145/3299869.3300069}, +} + + diff --git a/bib/pubs.bib b/bib/pubs.bib index bfed4d2..7a99105 100644 --- a/bib/pubs.bib +++ b/bib/pubs.bib @@ -1,366 +1,282 @@ - - +%% DO NOT EDIT +%% Generated by dblpbibcloud.py @article{Ghosn:283299, - title = {Enclosure: Language-Based Restriction of Untrusted Libraries}, - author = {Ghosn, Adrien and Kogias, Marios and Payer, Mathias and Larus, James and Bugnion, Edouard}, - pages = {13}, - year = {2021}, - journal = {Proceedings of the 26th ACM International Conference on Architectural Support for Programming Languages and Operating Systems}, - abstract = {Programming languages and systems have failed to address the security implications of the increasingly frequent use of public libraries to construct modern software. Most languages provide tools and online repositories to publish, import, and use libraries; however, this double-edged sword can incorporate a large quantity of unknown, unchecked, and unverified code into an application. The risk is real, as demonstrated by malevolent actors who have repeatedly inserted malware into popular open-source libraries. This paper proposes a solution: enclosures, a new programming language construct for library isolation that provides a developer with fine-grain control over the resources that a library can access, even for libraries with complex inter-library dependencies. The programming abstraction is language-independent and could be added to most languages. These languages would then be able to take advantage of hardware isolation mechanisms that are effective across language boundaries. The enclosure policies are enforced at run time by LitterBox, a language-independent framework that uses hardware mechanisms to provide uniform and robust isolation guarantees, even for libraries written in unsafe languages. LitterBox currently supports both Intel VT-x (with general-purpose extended page tables) and the emerging Intel Memory Protection Keys (MPK). We describe an enclosure implementation for the Go and Python languages. Our evaluation demonstrates that the Go implementation can protect sensitive data in real-world applications constructed using complex untrusted libraries with deep dependencies. It requires minimal code refactoring and incurs acceptable performance overhead. The Python implementation demonstrates LitterBox’s ability to support dynamic languages.}, - url = {http://infoscience.epfl.ch/record/283299}, - venue_short = {ASPLOS '21}, - -} + title={Enclosure: Language-Based Restriction of Untrusted Libraries}, + author={Ghosn, Adrien and Kogias, Marios and Payer, Mathias and Larus, James and Bugnion, Edouard}, + abstract={Programming languages and systems have failed to address the security implications of the increasingly frequent use of public libraries to construct modern software. 
Most languages provide tools and online repositories to publish, import, and use libraries; however, this double-edged sword can incorporate a large quantity of unknown, unchecked, and unverified code into an application. The risk is real, as demonstrated by malevolent actors who have repeatedly inserted malware into popular open-source libraries. This paper proposes a solution: enclosures, a new programming language construct for library isolation that provides a developer with fine-grain control over the resources that a library can access, even for libraries with complex inter-library dependencies. The programming abstraction is language-independent and could be added to most languages. These languages would then be able to take advantage of hardware isolation mechanisms that are effective across language boundaries. The enclosure policies are enforced at run time by LitterBox, a language-independent framework that uses hardware mechanisms to provide uniform and robust isolation guarantees, even for libraries written in unsafe languages. LitterBox currently supports both Intel VT-x (with general-purpose extended page tables) and the emerging Intel Memory Protection Keys (MPK). We describe an enclosure implementation for the Go and Python languages. Our evaluation demonstrates that the Go implementation can protect sensitive data in real-world applications constructed using complex untrusted libraries with deep dependencies. It requires minimal code refactoring and incurs acceptable performance overhead. The Python implementation demonstrates LitterBox’s ability to support dynamic languages.}, + url={http://infoscience.epfl.ch/record/283299}, + venue_short={ASPLOS '21}} @article{Ustiugov:283308, - title = {Benchmarking, Analysis, and Optimization of Serverless Function Snapshots}, - author = {Ustiugov, Dmitrii and Petrov, Plamen and Kogias, Marios and Bugnion, Edouard and Grot, Boris}, - pages = {14}, - year = {2021}, - journal = {Proceedings of the 26th ACM International Conference on Architectural Support for Programming Languages and Operating Systems}, - - abstract = {Serverless computing has seen rapid adoption due to its high scalability and flexible, pay-as-you-go billing model. In serverless, developers structure their services as a collection of functions, sporadically invoked by various events like clicks. High inter-arrival time variability of function invocations motivates the providers to start new function instances upon each invocation, leading to significant cold-start delays that degrade user experience. To reduce cold-start latency, the industry has turned to snapshotting, whereby an image of a fully-booted function is stored on disk, enabling a faster invocation compared to booting a function from scratch. This work introduces vHive, an open-source framework for serverless experimentation with the goal of enabling researchers to study and innovate across the entire serverless stack. Using vHive, we characterize a state-of-the-art snapshot-based serverless infrastructure, based on industry-leading Containerd orchestration framework and Firecracker hypervisor technologies. We find that the execution time of a function started from a snapshot is 95% higher, on average, than when the same function is memory- resident. We show that the high latency is attributable to frequent page faults as the function’s state is brought from disk into guest memory one page at a time. 
Our analysis further reveals that funccloud computing, datacenters, serverless, virtualization, snapshotstions access the same stable working set of pages across different invocations of the same function. By leveraging this insight, we build REAP, a light-weight software mechanism for serverless hosts that records functions’ stable working set of guest memory pages and proactively prefetches it from disk into memory. Compared to baseline snapshotting, REAP slashes the cold-start delays by 3.7×, on average.}, - url = {http://infoscience.epfl.ch/record/283308}, - venue_short = {ASPLOS '21}, - -} + title={Benchmarking, Analysis, and Optimization of Serverless Function Snapshots}, + author={Ustiugov, Dmitrii and Petrov, Plamen and Kogias, Marios and Bugnion, Edouard and Grot, Boris}, + abstract={Serverless computing has seen rapid adoption due to its high scalability and flexible, pay-as-you-go billing model. In serverless, developers structure their services as a collection of functions, sporadically invoked by various events like clicks. High inter-arrival time variability of function invocations motivates the providers to start new function instances upon each invocation, leading to significant cold-start delays that degrade user experience. To reduce cold-start latency, the industry has turned to snapshotting, whereby an image of a fully-booted function is stored on disk, enabling a faster invocation compared to booting a function from scratch. This work introduces vHive, an open-source framework for serverless experimentation with the goal of enabling researchers to study and innovate across the entire serverless stack. Using vHive, we characterize a state-of-the-art snapshot-based serverless infrastructure, based on industry-leading Containerd orchestration framework and Firecracker hypervisor technologies. We find that the execution time of a function started from a snapshot is 95% higher, on average, than when the same function is memory-resident. We show that the high latency is attributable to frequent page faults as the function’s state is brought from disk into guest memory one page at a time. Our analysis further reveals that functions access the same stable working set of pages across different invocations of the same function. By leveraging this insight, we build REAP, a light-weight software mechanism for serverless hosts that records functions’ stable working set of guest memory pages and proactively prefetches it from disk into memory. Compared to baseline snapshotting, REAP slashes the cold-start delays by 3.7×, on average.}, + url={http://infoscience.epfl.ch/record/283308}, + venue_short={ASPLOS '21}} -@article{Primorac:283315, - title = {When to Hedge in Interactive Services}, - author = {Primorac, Mia and Argyraki, Katerina and Bugnion, Edouard}, - publisher = {USENIX ASSOC}, - journal = {Proceedings Of The 18Th Usenix Symposium On Networked System Design And Implementation}, - address = {Berkeley}, - pages = {15. 373-388}, - year = {2021}, - abstract = {In online data-intensive (OLDI) services, each client request typically executes on multiple servers in parallel; as a result, “system hiccups”, although rare within a single server, can interfere with many client requests and cause violations of service-level objectives. Service providers have long been fighting this “tail at scale” problem through “hedging”, i.e., issuing redundant queries to mask system hiccups. 
This, however, can potentially cause congestion that is more detrimental to tail latency than the hiccups themselves. This paper asks: when does it make sense to hedge in OLDI services, and how can we hedge enough to mask system hiccups but not as much as to cause congestion? First, we show that there are many realistic scenarios where hedging can have no benefit—where any hedging-based scheduling policy, including the state-of-the-art, yields no latency reduction compared to optimal load balancing without hedging. Second, we propose LÆDGE, a scheduling policy that combines optimal load balancing with work-conserving hedging, and evaluate it in an AWS cloud deployment. We show that LÆDGE strikes the right balance: first, unlike the state of the art, it never causes unnecessary congestion; second, it performs close to an ideal scheduling policy, improving the 99th percentile latency by as much as 49%, measured on 60% system utilization—without any difficult parameter training as found in the state of the art.}, - url = {http://infoscience.epfl.ch/record/283315}, - venue_short = {NSDI '21}, - -} @article{Gupta:285466, - title = {Rebooting Virtual Memory with Midgard}, - author = {Gupta, Siddharth and Bhattacharyya, Atri and Oh, Yunho and Bhattacharjee, Abhishek and Falsafi, Babak and Payer, Mathias}, - journal = {Proceedings of the 48th International Symposium on Computer Architecture (ISCA'21)}, - pages = {14}, - venue_short = {ISCA '21}, - year = {2021}, - abstract = {Computer systems designers are building cache hierarchies with higher capacity to capture the ever-increasing working sets of modern workloads. Cache hierarchies with higher capacity improve system performance but shift the performance bottleneck to address translation. We propose Midgard, an intermediate address space between the virtual and the physical address spaces, to mitigate address translation overheads without program-level changes. Midgard leverages the operating system concept of virtual memory areas (VMAs) to realize a single Midgard address space where VMAs of all processes can be uniquely mapped. The Midgard address space serves as the namespace for all data in a coherence domain and the cache hierarchy. Because real-world workloads use far fewer VMAs than pages to represent their virtual address space, virtual to Midgard translation is achieved with hardware structures that are much smaller than TLB hierarchies. Costlier Midgard to physical address translations are needed only on LLC misses, which become much less frequent with larger caches. As a consequence, Midgard shows that instead of amplifying address translation overheads, memory hierarchies with large caches can reduce address translation overheads. Our evaluation shows that Midgard achieves only 5% higher address translation overhead as compared to traditional TLB hierarchies for 4KB pages when using a 16MB aggregate LLC. Midgard also breaks even with traditional TLB hierarchies for 2MB pages when using a 256MB aggregate LLC. 
For cache hierarchies with higher capacity, Midgard's address translation overhead drops to near zero as secondary and tertiary data working sets fit in the LLC, while traditional TLBs suffer even higher degrees of address translation overhead.}, - url = {http://infoscience.epfl.ch/record/285466}, - doi = {10.1109/ISCA52012.2021.00047}, -} + title={Rebooting Virtual Memory with Midgard}, + author={Gupta, Siddharth and Bhattacharyya, Atri and Oh, Yunho and Bhattacharjee, Abhishek and Falsafi, Babak and Payer, Mathias}, + abstract={Computer systems designers are building cache hierarchies with higher capacity to capture the ever-increasing working sets of modern workloads. Cache hierarchies with higher capacity improve system performance but shift the performance bottleneck to address translation. We propose Midgard, an intermediate address space between the virtual and the physical address spaces, to mitigate address translation overheads without program-level changes. Midgard leverages the operating system concept of virtual memory areas (VMAs) to realize a single Midgard address space where VMAs of all processes can be uniquely mapped. The Midgard address space serves as the namespace for all data in a coherence domain and the cache hierarchy. Because real-world workloads use far fewer VMAs than pages to represent their virtual address space, virtual to Midgard translation is achieved with hardware structures that are much smaller than TLB hierarchies. Costlier Midgard to physical address translations are needed only on LLC misses, which become much less frequent with larger caches. As a consequence, Midgard shows that instead of amplifying address translation overheads, memory hierarchies with large caches can reduce address translation overheads. Our evaluation shows that Midgard achieves only 5% higher address translation overhead as compared to traditional TLB hierarchies for 4KB pages when using a 16MB aggregate LLC. Midgard also breaks even with traditional TLB hierarchies for 2MB pages when using a 256MB aggregate LLC. For cache hierarchies with higher capacity, Midgard's address translation overhead drops to near zero as secondary and tertiary data working sets fit in the LLC, while traditional TLBs suffer even higher degrees of address translation overhead.}, + url={http://infoscience.epfl.ch/record/285466}, + venue_short={ISCA '21}} + + @article{PourhabibiZarandi:288432, - title = {Cerebros: Evading the RPC Tax in Datacenters}, - author = {Pourhabibi Zarandi, Arash and Sutherland, Mark Johnathon and Daglis, Alexandros and Falsafi, Babak}, - journal = {Proceedings of the 54th International Symposium on Microarchitecture (MICRO'21)}, - pages = {407-420}, - venue_short = {MICRO '21}, - year = {2021}, - abstract = {The emerging paradigm of microservices decomposes online services into fine-grained software modules frequently communicating over the datacenter network, often using Remote Procedure Calls (RPCs). Ongoing advancements in the network stack have exposed the RPC layer itself as a bottleneck, that we show accounts for 40–90% of a microservice's total execution cycles. We break down the underlying modules that comprise production RPC layers and demonstrate, based on prior evidence, that CPUs can only expect limited improvements for such tasks, mandating a shift to hardware to remove the RPC layer as a limiter of microservice performance. 
Although recently proposed accelerators can efficiently handle a portion of the RPC layer, their overall benefit is limited by unnecessary CPU involvement, which occurs because the accelerators are architected as co-processors under the CPU's control. Instead, we show that conclusively removing the RPC layer bottleneck requires all of the RPC layer's modules to be executed by a NIC-attached hardware accelerator. We introduce Cerebros, a dedicated RPC processor that executes the Apache Thrift RPC layer and acts as an intermediary stage between the NIC and the microservice running on the CPU. Our evaluation using the DeathStarBench microservice suite shows that Cerebros reduces the CPU cycles spent in the RPC layer by 37–64×, yielding a 1.8–14× reduction in total cycles expended per microservice request.}, - url = {http://infoscience.epfl.ch/record/288432}, - doi = {10.1145/3466752.3480055}, -} + title={Cerebros: Evading the RPC Tax in Datacenters}, + author={Pourhabibi Zarandi, Arash and Sutherland, Mark Johnathon and Daglis, Alexandros and Falsafi, Babak}, + abstract={The emerging paradigm of microservices decomposes online services into fine-grained software modules frequently communicating over the datacenter network, often using Remote Procedure Calls (RPCs). Ongoing advancements in the network stack have exposed the RPC layer itself as a bottleneck, that we show accounts for 40–90% of a microservice's total execution cycles. We break down the underlying modules that comprise production RPC layers and demonstrate, based on prior evidence, that CPUs can only expect limited improvements for such tasks, mandating a shift to hardware to remove the RPC layer as a limiter of microservice performance. Although recently proposed accelerators can efficiently handle a portion of the RPC layer, their overall benefit is limited by unnecessary CPU involvement, which occurs because the accelerators are architected as co-processors under the CPU's control. Instead, we show that conclusively removing the RPC layer bottleneck requires all of the RPC layer's modules to be executed by a NIC-attached hardware accelerator. We introduce Cerebros, a dedicated RPC processor that executes the Apache Thrift RPC layer and acts as an intermediary stage between the NIC and the microservice running on the CPU. Our evaluation using the DeathStarBench microservice suite shows that Cerebros reduces the CPU cycles spent in the RPC layer by 37–64×, yielding a 1.8–14× reduction in total cycles expended per microservice request.}, + url={http://infoscience.epfl.ch/record/288432}, + venue_short={MICRO '21}} + + @article{DrumondLagesDeOliveira:288715, - title = {Equinox: Training (for Free) on a Custom Inference Accelerator}, - author = {Drumond Lages De Oliveira, Mario Paulo and Coulon, Louis and Pourhabibi Zarandi, Arash and Yüzügüler, Ahmet Caner and Falsafi, Babak and Jaggi, Martin}, - publisher = {ACM}, - journal = {Proceedings of the 54th International Symposium on Microarchitecture (MICRO'21)}, - venue_short = {MICRO '21}, - year = {2021}, - abstract = {DNN inference accelerators executing online services exhibit low average loads because of service demand variability, leading to poor resource utilization. Unfortunately, reclaiming idle inference cycles is difficult as other workloads can not execute on a custom accelerator. With recent proposals for the use of fixed-point arithmetic in training, there are opportunities for training services to piggyback on inference accelerators. 
We make the observation that a key challenge in doing so is maintaining service-level latency constraints for inference. We show that relaxing latency constraints in an inference accelerator with ALU arrays that are batching-optimized achieves near-optimal throughput for a given area and power envelope while maintaining inference services' tail latency goals. We present Equinox, a custom inference accelerator designed to piggyback training. Equinox employs a uniform arithmetic encoding to accommodate inference and training and a priority hardware scheduler with adaptive batching that interleaves training during idle inference cycles. For a500𝜇𝑠 inference service time constraint, Equinox achieves 6.67× higher throughput than a latency-optimal inference accelerator. Despite not being optimized for training services, Equinox achieves up to 78% of the throughput of a dedicated training accelerator that saturates the available compute resources and DRAM bandwidth. Finally, Equinox’s controller logic incurs less than 1% power and area overhead, while the uniform encoding (to enable training) incurs 13% power and 4% area overhead compared to a fixed-point inference accelerator.}, - url = {http://infoscience.epfl.ch/record/288715}, - doi = {10.1145/3466752.3480057}, -} + title={Equinox: Training (for Free) on a Custom Inference Accelerator}, + author={Drumond Lages De Oliveira, Mario Paulo and Coulon, Louis and Pourhabibi Zarandi, Arash and Yüzügüler, Ahmet Caner and Falsafi, Babak and Jaggi, Martin}, + abstract={DNN inference accelerators executing online services exhibit low average loads because of service demand variability, leading to poor resource utilization. Unfortunately, reclaiming idle inference cycles is difficult as other workloads can not execute on a custom accelerator. With recent proposals for the use of fixed-point arithmetic in training, there are opportunities for training services to piggyback on inference accelerators. We make the observation that a key challenge in doing so is maintaining service-level latency constraints for inference. We show that relaxing latency constraints in an inference accelerator with ALU arrays that are batching-optimized achieves near-optimal throughput for a given area and power envelope while maintaining inference services' tail latency goals. We present Equinox, a custom inference accelerator designed to piggyback training. Equinox employs a uniform arithmetic encoding to accommodate inference and training and a priority hardware scheduler with adaptive batching that interleaves training during idle inference cycles. For a 500𝜇𝑠 inference service time constraint, Equinox achieves 6.67× higher throughput than a latency-optimal inference accelerator. Despite not being optimized for training services, Equinox achieves up to 78% of the throughput of a dedicated training accelerator that saturates the available compute resources and DRAM bandwidth. 
Finally, Equinox’s controller logic incurs less than 1% power and area overhead, while the uniform encoding (to enable training) incurs 13% power and 4% area overhead compared to a fixed-point inference accelerator.}, + url={http://infoscience.epfl.ch/record/288715}, + venue_short={MICRO '21}} + + +@article{Primorac:283315, + title={When to Hedge in Interactive Services}, + author={Primorac, Mia and Argyraki, Katerina and Bugnion, Edouard}, + abstract={In online data-intensive (OLDI) services, each client request typically executes on multiple servers in parallel; as a result, “system hiccups”, although rare within a single server, can interfere with many client requests and cause violations of service-level objectives. Service providers have long been fighting this “tail at scale” problem through “hedging”, i.e., issuing redundant queries to mask system hiccups. This, however, can potentially cause congestion that is more detrimental to tail latency than the hiccups themselves. This paper asks: when does it make sense to hedge in OLDI services, and how can we hedge enough to mask system hiccups but not as much as to cause congestion? First, we show that there are many realistic scenarios where hedging can have no benefit—where any hedging-based scheduling policy, including the state-of-the-art, yields no latency reduction compared to optimal load balancing without hedging. Second, we propose LÆDGE, a scheduling policy that combines optimal load balancing with work-conserving hedging, and evaluate it in an AWS cloud deployment. We show that LÆDGE strikes the right balance: first, unlike the state of the art, it never causes unnecessary congestion; second, it performs close to an ideal scheduling policy, improving the 99th percentile latency by as much as 49%, measured on 60% system utilization—without any difficult parameter training as found in the state of the art.}, + url={http://infoscience.epfl.ch/record/283315}, + venue_short={NSDI '21}} + + @article{Bhardwaj:289218, - title = {NrOS: Effective Replication and Sharing in an Operating System}, - author = {Bhardwaj, Ankit and Kulkarni, Chinmay and Achermann, Reto and Calciu, Irina and Kashyap, Sanidhya and Stutsman, Ryan and Tai, Amy and Zellweger, Gerd}, - publisher = {USENIX ASSOC}, - journal = {Proceedings Of The 15Th Usenix Symposium On Operating Systems Design And Implementation (OSDI'21)}, - address = {Berkeley}, - pages = {295-312}, - venue_short = {OSDI '21}, - year = {2021}, - abstract = {Writing a correct operating system kernel is notoriously hard. Kernel code requires manual memory management and type-unsafe code and must efficiently handle complex, asynchronous events. In addition, increasing CPU core counts further complicate kernel development. Typically, monolithic kernels share state across cores and rely on one-off synchronization patterns that are specialized for each kernel structure or subsystem. Hence, kernel developers are constantly refining synchronization within OS kernels to improve scalability at the risk of introducing subtle bugs.}, - url = {http://infoscience.epfl.ch/record/289218}, -} + title={NrOS: Effective Replication and Sharing in an Operating System}, + author={Bhardwaj, Ankit and Kulkarni, Chinmay and Achermann, Reto and Calciu, Irina and Kashyap, Sanidhya and Stutsman, Ryan and Tai, Amy and Zellweger, Gerd}, + abstract={Writing a correct operating system kernel is notoriously hard. 
Kernel code requires manual memory management and type-unsafe code and must efficiently handle complex, asynchronous events. In addition, increasing CPU core counts further complicate kernel development. Typically, monolithic kernels share state across cores and rely on one-off synchronization patterns that are specialized for each kernel structure or subsystem. Hence, kernel developers are constantly refining synchronization within OS kernels to improve scalability at the risk of introducing subtle bugs.}, + url={http://infoscience.epfl.ch/record/289218}, + venue_short={OSDI '21}} + + @article{Sioulas:285140, - title = {Scalable Multi-Query Execution using Reinforcement Learning}, - author = {Sioulas, Panagiotis and Ailamaki, Anastasia}, - publisher = {Association for Computing Machinery}, - journal = {Proceedings of the 2021 International Conference on Management of Data (SIGMOD'21)}, - address = {New York}, - pages = {13}, - venue_short = {SIGMOD '21}, - year = {2021}, - abstract = {The growing demand for data-intensive decision support and the migration to multi-tenant infrastructures put databases under the stress of high analytical query load. The requirement for high throughput contradicts the traditional design of query-at-a-time databases that optimize queries for efficient serial execution. Sharing work across queries presents an opportunity to reduce the total cost of processing and therefore improve throughput with increasing query load. Systems can share work either by assessing all opportunities and restructuring batches of queries ahead of execution, or by inspecting opportunities in individual incoming queries at runtime: the former strategy scales poorly to large query counts, as it requires expensive sharing-aware optimization, whereas the latter detects only a subset of the opportunities. Both strategies fail to minimize the cost of processing for large and ad-hoc workloads. This paper presents RouLette, a specialized intelligent engine for multi-query execution that addresses, through runtime adaptation, the shortcomings of existing work-sharing strategies. RouLette scales by replacing sharing-aware optimization with adaptive query processing, and it chooses opportunities to explore and exploit by using reinforcement learning. RouLette also includes optimizations that reduce the adaptation overhead. RouLette increases throughput by 1.6-28.3x, compared to a state-of-the-art query-at-a-time engine, and up to 6.5x, compared to sharing-enabled prototypes, for multi-query workloads based on the schema of TPC-DS.}, - url = {http://infoscience.epfl.ch/record/285140}, - doi = {10.1145/3448016.3452799}, -} -@article{KokorisKogias:287444, - title = {CALYPSO: Private Data Management for Decentralized Ledgers}, - author = {Kokoris Kogias, Eleftherios and Alp, Enis Ceyhun and Gasser, Linus and Jovanovic, Philipp Svetolik and Syta, Ewa and Ford, Bryan Alexander}, - journal = {Proceedings of the VLDB Endowment}, - number = {4}, - volume = {14}, - pages = {586-599}, - venue_short = {VLDB '21}, - year = {2021}, - abstract = {Distributed ledgers provide high availability and integrity, making them a key enabler for practical and secure computation of distributed workloads among mutually distrustful parties. Many practical applications also require strong confidentiality, however. This work enhances permissioned and permissionless blockchains with the ability to manage confidential data without forfeiting availability or decentralization. 
The proposed Calypso architecture addresses two orthogonal challenges confronting modern distributed ledgers: (a) enabling the auditable management of secrets and (b) protecting distributed computations against arbitrage attacks when their results depend on the ordering and secrecy of inputs. Calypso introduces on-chain secrets, a novel abstraction that enforces atomic deposition of an auditable trace whenever users access confidential data. Calypso provides user-controlled consent management that ensures revocation atomicity and accountable anonymity. To enable permissionless deployment, we introduce an incentive scheme and provide users with the option to select their preferred trustees. We evaluated our Calypso prototype with a confidential document-sharing application and a decentralized lottery. Our benchmarks show that transaction-processing latency increases linearly in terms of security (number of trustees) and is in the range of 0.2 to 8 seconds for 16 to 128 trustees.}, - url = {http://infoscience.epfl.ch/record/287444}, - doi = {10.14778/3436905.3436917}, -} + title={Scalable Multi-Query Execution using Reinforcement Learning}, + author={Sioulas, Panagiotis and Ailamaki, Anastasia}, + abstract={The growing demand for data-intensive decision support and the migration to multi-tenant infrastructures put databases under the stress of high analytical query load. The requirement for high throughput contradicts the traditional design of query-at-a-time databases that optimize queries for efficient serial execution. Sharing work across queries presents an opportunity to reduce the total cost of processing and therefore improve throughput with increasing query load. Systems can share work either by assessing all opportunities and restructuring batches of queries ahead of execution, or by inspecting opportunities in individual incoming queries at runtime: the former strategy scales poorly to large query counts, as it requires expensive sharing-aware optimization, whereas the latter detects only a subset of the opportunities. Both strategies fail to minimize the cost of processing for large and ad-hoc workloads. This paper presents RouLette, a specialized intelligent engine for multi-query execution that addresses, through runtime adaptation, the shortcomings of existing work-sharing strategies. RouLette scales by replacing sharing-aware optimization with adaptive query processing, and it chooses opportunities to explore and exploit by using reinforcement learning. RouLette also includes optimizations that reduce the adaptation overhead. RouLette increases throughput by 1.6-28.3x, compared to a state-of-the-art query-at-a-time engine, and up to 6.5x, compared to sharing-enabled prototypes, for multi-query workloads based on the schema of TPC-DS.}, + url={http://infoscience.epfl.ch/record/285140}, + venue_short={SIGMOD '21}} + + @article{Sirin:284817, - title = {Micro-architectural Analysis of In-memory OLTP: Revisited}, - author = {Sirin, Utku and Tözün, Pınar and Porobic, Danica and Yasin, Ahmad and Ailamaki, Anastasia}, - journal = {The VLDB Journal}, - number = {4}, - volume = {30}, - venue_short = {VLDBJ '21}, - year = {2021}, - abstract = {Micro-architectural behavior of traditional disk-based online transaction processing (OLTP) systems has been investigated extensively over the past couple of decades. Results show that traditional OLTP systems mostly under-utilize the available micro-architectural resources. 
In-memory OLTP systems, on the other hand, process all the data in main-memory and, therefore, can omit the buffer pool. Furthermore, they usually adopt more lightweight concurrency control mechanisms, cache-conscious data structures, and cleaner codebases since they are usually designed from scratch. Hence, we expect significant differences in micro-architectural behavior when running OLTP on platforms optimized for in-memory processing as opposed to disk-based database systems. In particular, we expect that in-memory systems exploit micro-architectural features such as instruction and data caches significantly better than disk-based systems. This paper sheds light on the micro-architectural behavior of in-memory database systems by analyzing and contrasting it to the behavior of disk-based systems when running OLTP workloads. The results show that, despite all the design changes, in-memory OLTP exhibits very similar micro-architectural behavior to disk-based OLTP: more than half of the execution time goes to memory stalls where instruction cache misses or the long-latency data misses from the last-level cache (LLC) are the dominant factors in the overall execution time. Even though ground-up designed in-memory systems can eliminate the instruction cache misses, the reduction in instruction stalls amplifies the impact of LLC data misses. As a result, only 30% of the CPU cycles are used to retire instructions, and 70% of the CPU cycles are wasted to stalls for both traditional disk-based and new generation in-memory OLTP.}, - url = {http://infoscience.epfl.ch/record/284817}, - doi = {10.1007/s00778-021-00663-8}, -} -@article{PourhabibiZarandi:274129, - title = {Optimus Prime: Accelerating Data Transformation in Servers}, - author = {Pourhabibi Zarandi, Arash and Gupta, Siddharth and Kassir, Hussein and Sutherland, Mark Johnathon and Tian, Zilu and Drumond Lages De Oliveira, Mario Paulo and Falsafi, Babak and Koch, Christoph}, - publisher = {ACM}, - journal = {Proceedings of the Twenty-Fifth International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS'20)}, - address = {New York, NY, USA}, - pages = {14. 1203-1216}, - venue_short = {ASPLOS '20}, - year = {2020}, - abstract = {Modern online services are shifting away from monolithic applications to loosely-coupled microservices because of their improved scalability, reliability, programmability and development velocity. Microservices communicating over the datacenter network require data transformation (DT) to convert messages back and forth between their internal formats. This work identifies DT as a bottleneck due to reductions in latency of the surrounding system components, namely application runtimes, protocol stacks, and network hardware. We therefore propose Optimus Prime (OP), a programmable DT accelerator that uses a novel abstraction, an in-memory schema, to represent DT operations. The schema is compatible with today's DT frameworks and enables any compliant accelerator to perform the transformations comprising a request in parallel. Our evaluation shows that OP's DT throughput matches the line rate of today's NICs and has 60x higher throughput compared to software, at a tiny fraction of the CPU's silicon area and power. 
We also evaluate a set of microservices running on Thrift, and show up to 30% reduction in service latency.}, - url = {http://infoscience.epfl.ch/record/274129}, - doi = {10.1145/3373376.3378501}, -} -@article{Aguilera:287833, - title = {Microsecond Consensus for Microsecond Applications}, - author = {Aguilera, Marcos K. and Ben-David, Naama and Guerraoui, Rachid and Marathe, Virendra J. and Xygkis, Athanasios and Zablotchi, Igor}, - publisher = {USENIX ASSOC}, - journal = {Proceedings Of The 14th Usenix Symposium On Operating Systems Design And Implementation (OSDI'20)}, - address = {Berkeley}, - pages = {599-616}, - venue_short = {OSDI '20}, - year = {2020}, - abstract = {We consider the problem of making apps fault-tolerant through replication, when apps operate at the microsecond scale, as in finance, embedded computing, and microservices apps. These apps need a replication scheme that also operates at the microsecond scale, otherwise replication becomes a burden. We propose Mu, a system that takes less than 1.3 microseconds to replicate a (small) request in memory, and less than a millisecond to fail-over the system-this cuts the replication and fail-over latencies of the prior systems by at least 61% and 90%. Mu implements bona fide state machine replication/consensus (SMR) with strong consistency for a generic app, but it really shines on microsecond apps, where even the smallest overhead is significant. To provide this performance, Mu introduces a new SMR protocol that carefully leverages RDMA. Roughly, in Mu a leader replicates a request by simply writing it directly to the log of other replicas using RDMA, without any additional communication. Doing so, however, introduces the challenge of handling concurrent leaders, changing leaders, garbage collecting the logs, and more-challenges that we address in this paper through a judicious combination of RDMA permissions and distributed algorithmic design. We implemented Mu and used it to replicate several systems: a financial exchange app called Liquibook, Redis, Memcached, and HERD [33]. Our evaluation shows that Mu incurs a small replication latency, in some cases being the only viable replication system that incurs an acceptable overhead.}, - url = {http://infoscience.epfl.ch/record/287833}, -} + title={Micro-architectural Analysis of In-memory OLTP: Revisited}, + author={Sirin, Utku and Tözün, Pınar and Porobic, Danica and Yasin, Ahmad and Ailamaki, Anastasia}, + abstract={Micro-architectural behavior of traditional disk-based online transaction processing (OLTP) systems has been investigated extensively over the past couple of decades. Results show that traditional OLTP systems mostly under-utilize the available micro-architectural resources. In-memory OLTP systems, on the other hand, process all the data in main-memory and, therefore, can omit the buffer pool. Furthermore, they usually adopt more lightweight concurrency control mechanisms, cache-conscious data structures, and cleaner codebases since they are usually designed from scratch. Hence, we expect significant differences in micro-architectural behavior when running OLTP on platforms optimized for in-memory processing as opposed to disk-based database systems. In particular, we expect that in-memory systems exploit micro-architectural features such as instruction and data caches significantly better than disk-based systems. 
This paper sheds light on the micro-architectural behavior of in-memory database systems by analyzing and contrasting it to the behavior of disk-based systems when running OLTP workloads. The results show that, despite all the design changes, in-memory OLTP exhibits very similar micro-architectural behavior to disk-based OLTP: more than half of the execution time goes to memory stalls where instruction cache misses or the long-latency data misses from the last-level cache (LLC) are the dominant factors in the overall execution time. Even though ground-up designed in-memory systems can eliminate the instruction cache misses, the reduction in instruction stalls amplifies the impact of LLC data misses. As a result, only 30% of the CPU cycles are used to retire instructions, and 70% of the CPU cycles are wasted to stalls for both traditional disk-based and new generation in-memory OLTP.}, + url={http://infoscience.epfl.ch/record/284817}, + venue_short={VLDBJ '21}} + + @article{Dauterman:287869, + title={SafetyPin: Encrypted Backups with Human-Memorable Secrets}, + author={Dauterman, Emma and Corrigan-Gibbs, Henry and Mazieres, David}, + abstract={We present the design and implementation of SafetyPin, a system for encrypted mobile-device backups. Like existing cloud-based mobile-backup systems, including those of Apple and Google, SafetyPin requires users to remember only a short PIN and defends against brute-force PIN-guessing attacks using hardware security protections. Unlike today's systems, SafetyPin splits trust over a cluster of hardware security modules (HSMs) in order to provide security guarantees that scale with the number of HSMs. In this way, SafetyPin protects backed-up user data even against an attacker that can adaptively compromise many of the system's constituent HSMs. SafetyPin provides this protection without sacrificing scalability or fault tolerance. Decentralizing trust while respecting the resource limits of today's HSMs requires a synthesis of systems-design principles and cryptographic tools. We evaluate SafetyPin on a cluster of 100 low-cost HSMs and show that a SafetyPin-protected recovery takes 1.01 seconds. To process 1B recoveries a year, we estimate that a SafetyPin deployment would need 3,100 low-cost HSMs.}, + url={http://infoscience.epfl.ch/record/287869}, + venue_short={OSDI '20}} + + @article{Cohn-Gordon:287859, + title={DELF: Safeguarding Deletion Correctness in Online Social Networks}, + author={Cohn-Gordon, Katriel and Damaskinos, Georgios and Neto, Divino and Cordova, Shi and Reitz, Benoit and Strahs, Benjamin and Obenshain, Daniel and Pearce, Paul and Papagiannis, Ioannis}, + abstract={Deletion is a core facet of Online Social Networks (OSNs). For users, deletion is a tool to remove what they have shared and control their data. For OSNs, robust deletion is both an obligation to their users and a risk when developer mistakes inevitably occur. While developers are effective at identifying high-level deletion requirements in products (e.g., users should be able to delete posted photos), they are less effective at mapping high-level requirements into concrete operations (e.g., deleting all relevant items in data stores). Without framework support, developer mistakes lead to violations of users' privacy, such as retaining data that should be deleted, deleting the wrong data, and exploitable vulnerabilities.}, + url={http://infoscience.epfl.ch/record/287859}, + venue_short={USENIX Security '20}} -@article{Pirelli:282923, - title = {A Simpler and Faster NIC Driver Model for Network Functions}, - author = {Pirelli, Solal and Candea, George}, - publisher = {USENIX ASSOC}, - journal = {Proceedings Of The 14th Usenix Symposium On Operating Systems Design And Implementation (OSDI'20)}, - address = {Berkeley}, - venue_short = {OSDI '20}, - year = {2020}, - abstract = {The advent of software network functions calls for stronger correctness guarantees and higher performance at every level of the stack. Current network stacks trade simplicity for performance and flexibility, especially in their driver model. We show that performance and simplicity can coexist, at the cost of some flexibility, with a new NIC driver model tailored to network functions. The key idea behind our model is that the driver can efficiently reuse packet buffers because buffers follow a single logical path. We implement a driver for the Intel 82599 network card in 550 lines of code. 
By merely replacing the state-of-theart driver with our driver, formal verification of the entire software stack completes in 7x less time, while the verified functions’ throughput improves by 160%. Our driver also beats, on realistic workloads, the throughput of drivers that cannot yet be formally verified, thanks to its low variability and resource use. Our code is available at github.com/dslab-epfl/tinynf.}, - url = {http://infoscience.epfl.ch/record/282923}, -} + title={SafetyPin: Encrypted Backups with Human-Memorable Secrets}, + author={Dauterman, Emma and Corrigan-Gibbs, Henry and Mazieres, David}, + abstract={We present the design and implementation of SafetyPin, a system for encrypted mobile-device backups. Like existing cloud-based mobile-backup systems, including those of Apple and Google, SafetyPin requires users to remember only a short PIN and defends against brute-force PIN-guessing attacks using hardware security protections. Unlike today's systems, SafetyPin splits trust over a cluster of hardware security modules (HSMs) in order to provide security guarantees that scale with the number of HSMs. In this way, SafetyPin protects backed-up user data even against an attacker that can adaptively compromise many of the system's constituent HSMs. SafetyPin provides this protection without sacrificing scalability or fault tolerance. Decentralizing trust while respecting the resource limits of today's HSMs requires a synthesis of systems-design principles and cryptographic tools. We evaluate SafetyPin on a cluster of 100 low-cost HSMs and show that a SafetyPin-protected recovery takes 1.01 seconds. To process 1B recoveries a year, we estimate that a SafetyPin deployment would need 3,100 low-cost HSMs.}, + url={http://infoscience.epfl.ch/record/287869}, + venue_short={OSDI '20}} + + @article{Cohn-Gordon:287859, - title = {DELF: Safeguarding Deletion Correctness in Online Social Networks}, - author = {Cohn-Gordon, Katriel and Damaskinos, Georgios and Neto, Divino and Cordova, Shi and Reitz, Benoit and Strahs, Benjamin and Obenshain, Daniel and Pearce, Paul and Papagiannis, Loannis}, - publisher = {USENIX ASSOC}, - journal = {Proceedings Of The 29th Usenix Security Symposium}, - address = {Berkeley}, - pages = {1057-1074}, - venue_short = {USENIX Security '20}, - year = {2020}, - abstract = {Deletion is a core facet of Online Social Networks (OSNs). For users, deletion is a tool to remove what they have shared and control their data. For OSNs, robust deletion is both an obligation to their users and a risk when developer mistakes inevitably occur. While developers are effective at identifying high-level deletion requirements in products (e.g., users should be able to delete posted photos), they are less effective at mapping high-level requirements into concrete operations (e.g., deleting all relevant items in data stores). Without framework support, developer mistakes lead to violations of users' privacy, such as retaining data that should be deleted, deleting the wrong data, and exploitable vulnerabilities.}, - url = {http://infoscience.epfl.ch/record/287859}, -} -@article{Ispoglou:287822, - title = {FuzzGen: Automatic Fuzzer Generation}, - author = {Ispoglou, Kyriakos K.
and Austin, Daniel and Mohan, Vishwath and Payer, Mathias}, - publisher = {USENIX ASSOC}, - journal = {Proceedings Of The 29th Usenix Security Symposium}, - address = {Berkeley}, - pages = {2271-2287}, - venue_short = {USENIX Security '20}, - year = {2020}, - abstract = {Fuzzing is a testing technique to discover unknown vulnerabilities in software. When applying fuzzing to libraries, the core idea of supplying random input remains unchanged, yet it is non-trivial to achieve good code coverage. Libraries cannot run as standalone programs, but instead are invoked through another application. Triggering code deep in a library remains challenging as specific sequences of API calls are required to build up the necessary state. Libraries are diverse and have unique interfaces that require unique fuzzers, so far written by a human analyst.}, - url = {http://infoscience.epfl.ch/record/287822}, -} -@article{Clements:287890, - title = {HALucinator: Firmware Re-hosting Through Abstraction Layer Emulation}, - author = {Clements, Abraham A. and Gustafson, Eric and Scharnowski, Tobias and Grosen, Paul and Fritz, David and Kruegel, Christopher and Vigna, Giovanni and Bagchi, Saurabh and Payer, Mathias}, - publisher = {USENIX ASSOC}, - journal = {Proceedings Of The 29th Usenix Security Symposium}, - address = {Berkeley}, - pages = {1201-1218}, - venue_short = {USENIX Security '20}, - year = {2020}, - abstract = {Given the increasing ubiquity of online embedded devices, analyzing their firmware is important to security, privacy, and safety. The tight coupling between hardware and firmware and the diversity found in embedded systems makes it hard to perform dynamic analysis on firmware. However, firmware developers regularly develop code using abstractions, such as Hardware Abstraction Layers (HALs), to simplify their job. We leverage such abstractions as the basis for the re-hosting and analysis of firmware. By providing high-level replacements for HAL functions (a process termed High-Level Emulation - HLE), we decouple the hardware from the firmware. This approach works by first locating the library functions in a firmware sample, through binary analysis, and then providing generic implementations of these functions in a full-system emulator.}, - url = {http://infoscience.epfl.ch/record/287890}, -} -@article{Peng:287847, - title = {USBFuzz: A Framework for Fuzzing USB Drivers by Device Emulation}, - author = {Peng, Hui and Payer, Mathias}, - publisher = {USENIX ASSOC}, - journal = {Proceedings Of The 29th Usenix Security Symposium}, - address = {Berkeley}, - pages = {2559-2575}, - venue_short = {USENIX Security '20}, - year = {2020}, - abstract = {The Universal Serial Bus (USB) connects external devices to a host. This interface exposes the OS kernels and device drivers to attacks by malicious devices. Unfortunately, kernels and drivers were developed under a security model that implicitly trusts connected devices. Drivers expect faulty hardware but not malicious attacks. Similarly, security testing drivers is challenging as input must cross the hardware/software barrier. Fuzzing, the most widely used bug finding technique, relies on providing random data to programs. 
However, fuzzing device drivers is challenging due to the difficulty in crossing the hardware/software barrier and providing random device data to the driver under test.}, - url = {http://infoscience.epfl.ch/record/287847}, -} -@article{Anadiotis:282822, - title = {A System Design for Elastically Scaling Transaction Processing Engines in Virtualized Servers}, - author = {Anadiotis, Angelos-Christos and Appuswamy, Raja and Ailamaki, Anastasia and Bronshtein, Ilan and Avni, Hillel and Dominguez-Sal, David and Goikhman, Shay and Levy, Eliezer}, - publisher = {ASSOC COMPUTING MACHINERY}, - journal = {Proceedings of the VLDB Endowment}, - address = {New York}, - number = {12}, - volume = {13}, - pages = {3085-3098}, - venue_short = {VLDB '20}, - year = {2020}, - abstract = {Online Transaction Processing (OLTP) deployments are migrating from on-premise to cloud settings in order to exploit the elasticity of cloud infrastructure which allows them to adapt to workload variations. However, cloud adaptation comes at the cost of redesigning the engine, which has led to the introduction of several, new, cloud-based transaction processing systems mainly focusing on: (i) the transaction coordination protocol, (ii) the data partitioning strategy, and, (iii) the resource isolation across multiple tenants. As a result, standalone OLTP engines cannot be easily deployed with an elastic setting in the cloud and they need to migrate to another, specialized deployment.}, - url = {http://infoscience.epfl.ch/record/282822}, - doi = {10.14778/3415478.3415536}, -} -@article{Sirin:280879, - title = {Micro-architectural Analysis of OLAP: Limitations and Opportunities}, - author = {Sirin, Utku and Ailamaki, Anastasia}, - publisher = {ASSOC COMPUTING MACHINERY}, - journal = {Proceedings of the VLDB Endowment},, - address = {New York}, - number = {6}, - volume = {13}, - pages = {840-853}, - venue_short = {VLDB '20}, - year = {2020}, - abstract = {Understanding micro-architectural behavior is important for efficiently using hardware resources. Recent work has shown that in-memory online transaction processing (OLTP) systems severely underutilize their core micro-architecture resources [29]. Whereas, online analytical processing (OLAP) workloads exhibit a completely different computing pattern. OLAP workloads are read-only, bandwidth-intensive, and include various data access patterns. With the rise of column-stores, they run on high-performance engines that are tightly optimized for modern hardware. Consequently, micro-architectural behavior of modern OLAP systems remains unclear.}, - url = {http://infoscience.epfl.ch/record/280879}, - doi = {10.14778/3380750.3380755}, -} + title={DELF: Safeguarding Deletion Correctness in Online Social Networks}, + author={Cohn-Gordon, Katriel and Damaskinos, Georgios and Neto, Divino and Cordova, Shi and Reitz, Benoit and Strahs, Benjamin and Obenshain, Daniel and Pearce, Paul and Papagiannis, Ioannis}, + abstract={Deletion is a core facet of Online Social Networks (OSNs). For users, deletion is a tool to remove what they have shared and control their data. For OSNs, robust deletion is both an obligation to their users and a risk when developer mistakes inevitably occur. While developers are effective at identifying high-level deletion requirements in products (e.g., users should be able to delete posted photos), they are less effective at mapping high-level requirements into concrete operations (e.g., deleting all relevant items in data stores).
Without framework support, developer mistakes lead to violations of users' privacy, such as retaining data that should be deleted, deleting the wrong data, and exploitable vulnerabilities.}, + url={http://infoscience.epfl.ch/record/287859}, + venue_short={USENIX Security '20}} + + @article{Olma:275745, - title = {Adaptive Partitioning and Indexing for In-situ Query Processing}, - author = {Olma, Matthaios and Karpathiotakis, Manos and Alagiannis, Ioannis and Athanassoulis, Manos and Ailamaki, Anastasia}, - publisher = {SPRINGER}, - journal = {The VLDB Journal}, - address = {New York}, - number = {1}, - volume = {29}, - pages = {569-591}, - venue_short = {VLDBJ '20}, - year = {2020}, - abstract = {The constant flux of data and queries alike has been pushing the boundaries of data analysis systems. The increasing size of raw data files has made data loading an expensive operation that delays the data-to-insight time. To alleviate the loading cost, in situ query processing systems operate directly over raw data and offer instant access to data. At the same time, analytical workloads have increasing number of queries. Typically, each query focuses on a constantly shifting-yet small-range. As a result, minimizing the workload latency requires the benefits of indexing in in situ query processing. In this paper, we present an online partitioning and indexing scheme, along with a partitioning and indexing tuner tailored for in situ querying engines. The proposed system design improves query execution time by taking into account user query patterns, to (i) partition raw data files logically and (ii) build lightweight partition-specific indexes for each partition. We build an in situ query engine called Slalom to showcase the impact of our design. Slalom employs adaptive partitioning and builds non-obtrusive indexes in different partitions on-the-fly based on lightweight query access pattern monitoring. As a result of its lightweight nature, Slalom achieves efficient query processing over raw data with minimal memory consumption. Our experimentation with both microbenchmarks and real-life workloads shows that Slalom outperforms state-of-the-art in situ engines and achieves comparable query response times with fully indexed DBMS, offering lower cumulative query execution times for query workloads with increasing size and unpredictable access patterns.}, - url = {http://infoscience.epfl.ch/record/275745}, - doi = {10.1007/s00778-019-00580-x}, -} -@article{Daglis:265809, - title = {RPCValet: NI-Driven Tail-Aware Balancing of µs-Scale RPCs}, - author = {Daglis, Alexandros and Sutherland, Mark and Falsafi, Babak}, - publisher = {ACM}, - journal = {Proceedings of the Twenty-Fourth International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS'19)}, - address = {New York, NY}, - pages = {35-48}, - venue_short = {ASPLOS '19}, - year = {2019}, - abstract = {Modern online services come with stringent quality requirements in terms of response time tail latency. Because of their decomposition into fine-grained communicating software layers, a single user request fans out into a plethora of short, μs-scale RPCs, aggravating the need for faster inter-server communication. In reaction to that need, we are witnessing a technological transition characterized by the emergence of hardware-terminated user-level protocols (e.g., InfiniBand/RDMA) and new architectures with fully integrated Network Interfaces (NIs). 
Such architectures offer a unique opportunity for a new NI-driven approach to balancing RPCs among the cores of manycore server CPUs, yielding major tail latency improvements for μs-scale RPCs. We introduce RPCValet, an NI-driven RPC load-balancing design for architectures with hardware-terminated protocols and integrated NIs, that delivers near-optimal tail latency. RPCValet's RPC dispatch decisions emulate the theoretically optimal single-queue system, without incurring synchronization overheads currently associated with single-queue implementations. Our design improves throughput under tight tail latency goals by up to 1.4x, and reduces tail latency before saturation by up to 4x for RPCs with μs-scale service times, as compared to current systems with hardware support for RPC load distribution. RPCValet performs within 15% of the theoretically optimal single-queue system.}, - url = {http://infoscience.epfl.ch/record/265809}, - doi = {10.1145/3297858.3304070}, -} + title={Adaptive Partitioning and Indexing for In-situ Query Processing}, + author={Olma, Matthaios and Karpathiotakis, Manos and Alagiannis, Ioannis and Athanassoulis, Manos and Ailamaki, Anastasia}, + abstract={The constant flux of data and queries alike has been pushing the boundaries of data analysis systems. The increasing size of raw data files has made data loading an expensive operation that delays the data-to-insight time. To alleviate the loading cost, in situ query processing systems operate directly over raw data and offer instant access to data. At the same time, analytical workloads have increasing number of queries. Typically, each query focuses on a constantly shifting-yet small-range. As a result, minimizing the workload latency requires the benefits of indexing in in situ query processing. In this paper, we present an online partitioning and indexing scheme, along with a partitioning and indexing tuner tailored for in situ querying engines. The proposed system design improves query execution time by taking into account user query patterns, to (i) partition raw data files logically and (ii) build lightweight partition-specific indexes for each partition. We build an in situ query engine called Slalom to showcase the impact of our design. Slalom employs adaptive partitioning and builds non-obtrusive indexes in different partitions on-the-fly based on lightweight query access pattern monitoring. As a result of its lightweight nature, Slalom achieves efficient query processing over raw data with minimal memory consumption. Our experimentation with both microbenchmarks and real-life workloads shows that Slalom outperforms state-of-the-art in situ engines and achieves comparable query response times with fully indexed DBMS, offering lower cumulative query execution times for query workloads with increasing size and unpredictable access patterns.}, + url={http://infoscience.epfl.ch/record/275745}, + venue_short={VLDBJ '20}} + + +@article{PourhabibiZarandi:274129, + title={Optimus Prime: Accelerating Data Transformation in Servers}, + author={Pourhabibi Zarandi, Arash and Gupta, Siddharth and Kassir, Hussein and Sutherland, Mark Johnathon and Tian, Zilu and Drumond Lages De Oliveira, Mario Paulo and Falsafi, Babak and Koch, Christoph}, + abstract={Modern online services are shifting away from monolithic applications to loosely-coupled microservices because of their improved scalability, reliability, programmability and development velocity. 
Microservices communicating over the datacenter network require data transformation (DT) to convert messages back and forth between their internal formats. This work identifies DT as a bottleneck due to reductions in latency of the surrounding system components, namely application runtimes, protocol stacks, and network hardware. We therefore propose Optimus Prime (OP), a programmable DT accelerator that uses a novel abstraction, an in-memory schema, to represent DT operations. The schema is compatible with today's DT frameworks and enables any compliant accelerator to perform the transformations comprising a request in parallel. Our evaluation shows that OP's DT throughput matches the line rate of today's NICs and has 60x higher throughput compared to software, at a tiny fraction of the CPU's silicon area and power. We also evaluate a set of microservices running on Thrift, and show up to 30% reduction in service latency.}, + url={http://infoscience.epfl.ch/record/274129}, + venue_short={ASPLOS '20}} + + @article{Sutherland:277391, + title={The NEBULA RPC-Optimized Architecture}, + author={Sutherland, Mark and Gupta, Siddharth and Falsafi, Babak and Marathe, Virendra and Pnevmatikatos, Dionisios and Daglis, Alexandros}, + abstract={Large-scale online services are commonly structured as a network of software tiers, which communicate over the datacenter network using RPCs. Ongoing trends towards software decomposition have led to the prevalence of tiers receiving and generating RPCs with runtimes of only a few microseconds. With such small software runtimes, even the smallest latency overheads in RPC handling have a significant relative performance impact. In particular, we find that growing network bandwidth introduces queuing effects within a server’s memory hierarchy, considerably hurting the response latency of fine-grained RPCs. In this work we introduce NeBuLa, an architecture optimized to accelerate the most challenging microsecond-scale RPCs, by leveraging two novel mechanisms to drastically improve server throughput under strict tail latency goals. First, NeBuLa reduces detrimental queuing at the memory controllers via hardware support for efficient in-LLC network buffer management. Second, NeBuLa’s network interface steers incoming RPCs into the CPU cores’ L1 caches, improving RPC startup latency. Our evaluation shows that NeBuLa boosts the throughput of a state-of-the-art key-value store by 1.25–2.19x compared to existing proposals, while maintaining strict tail latency goals.}, + url={http://infoscience.epfl.ch/record/277391}, + venue_short={ISCA '20}} + + @article{Pirelli:282923, + title={A Simpler and Faster NIC Driver Model for Network Functions}, + author={Pirelli, Solal and Candea, George}, + abstract={The advent of software network functions calls for stronger correctness guarantees and higher performance at every level of the stack. Current network stacks trade simplicity for performance and flexibility, especially in their driver model. We show that performance and simplicity can coexist, at the cost of some flexibility, with a new NIC driver model tailored to network functions. The key idea behind our model is that the driver can efficiently reuse packet buffers because buffers follow a single logical path. We implement a driver for the Intel 82599 network card in 550 lines of code.
By merely replacing the state-of-the-art driver with our driver, formal verification of the entire software stack completes in 7x less time, while the verified functions’ throughput improves by 160%. Our driver also beats, on realistic workloads, the throughput of drivers that cannot yet be formally verified, thanks to its low variability and resource use. Our code is available at github.com/dslab-epfl/tinynf.}, + url={http://infoscience.epfl.ch/record/282923}, + venue_short={OSDI '20}} + + @article{Aguilera:287833, + title={Microsecond Consensus for Microsecond Applications}, + author={Aguilera, Marcos K. and Ben-David, Naama and Guerraoui, Rachid and Marathe, Virendra J. and Xygkis, Athanasios and Zablotchi, Igor}, + abstract={We consider the problem of making apps fault-tolerant through replication, when apps operate at the microsecond scale, as in finance, embedded computing, and microservices apps. These apps need a replication scheme that also operates at the microsecond scale, otherwise replication becomes a burden. We propose Mu, a system that takes less than 1.3 microseconds to replicate a (small) request in memory, and less than a millisecond to fail-over the system-this cuts the replication and fail-over latencies of the prior systems by at least 61% and 90%. Mu implements bona fide state machine replication/consensus (SMR) with strong consistency for a generic app, but it really shines on microsecond apps, where even the smallest overhead is significant. To provide this performance, Mu introduces a new SMR protocol that carefully leverages RDMA. Roughly, in Mu a leader replicates a request by simply writing it directly to the log of other replicas using RDMA, without any additional communication. Doing so, however, introduces the challenge of handling concurrent leaders, changing leaders, garbage collecting the logs, and more-challenges that we address in this paper through a judicious combination of RDMA permissions and distributed algorithmic design. We implemented Mu and used it to replicate several systems: a financial exchange app called Liquibook, Redis, Memcached, and HERD [33]. Our evaluation shows that Mu incurs a small replication latency, in some cases being the only viable replication system that incurs an acceptable overhead.}, + url={http://infoscience.epfl.ch/record/287833}, + venue_short={OSDI '20}} + + @article{Anadiotis:282822, + title={A System Design for Elastically Scaling Transaction Processing Engines in Virtualized Servers}, + author={Anadiotis, Angelos-Christos and Appuswamy, Raja and Ailamaki, Anastasia and Bronshtein, Ilan and Avni, Hillel and Dominguez-Sal, David and Goikhman, Shay and Levy, Eliezer}, + abstract={Online Transaction Processing (OLTP) deployments are migrating from on-premise to cloud settings in order to exploit the elasticity of cloud infrastructure which allows them to adapt to workload variations. However, cloud adaptation comes at the cost of redesigning the engine, which has led to the introduction of several, new, cloud-based transaction processing systems mainly focusing on: (i) the transaction coordination protocol, (ii) the data partitioning strategy, and, (iii) the resource isolation across multiple tenants.
As a result, standalone OLTP engines cannot be easily deployed with an elastic setting in the cloud and they need to migrate to another, specialized deployment.}, + url={http://infoscience.epfl.ch/record/282822}, + venue_short={VLDB '20}} + + +@article{KokorisKogias:287444, + title={CALYPSO: Private Data Management for Decentralized Ledgers}, + author={Kokoris Kogias, Eleftherios and Alp, Enis Ceyhun and Gasser, Linus and Jovanovic, Philipp Svetolik and Syta, Ewa and Ford, Bryan Alexander}, + abstract={Distributed ledgers provide high availability and integrity, making them a key enabler for practical and secure computation of distributed workloads among mutually distrustful parties. Many practical applications also require strong confidentiality, however. This work enhances permissioned and permissionless blockchains with the ability to manage confidential data without forfeiting availability or decentralization. The proposed Calypso architecture addresses two orthogonal challenges confronting modern distributed ledgers: (a) enabling the auditable management of secrets and (b) protecting distributed computations against arbitrage attacks when their results depend on the ordering and secrecy of inputs. Calypso introduces on-chain secrets, a novel abstraction that enforces atomic deposition of an auditable trace whenever users access confidential data. Calypso provides user-controlled consent management that ensures revocation atomicity and accountable anonymity. To enable permissionless deployment, we introduce an incentive scheme and provide users with the option to select their preferred trustees. We evaluated our Calypso prototype with a confidential document-sharing application and a decentralized lottery. Our benchmarks show that transaction-processing latency increases linearly in terms of security (number of trustees) and is in the range of 0.2 to 8 seconds for 16 to 128 trustees.}, + url={http://infoscience.epfl.ch/record/287444}, + venue_short={PVLDB '20}} + + +@article{Giannakopoulou:286040, + title={Cleaning Denial Constraint Violations through Relaxation}, + author={Giannakopoulou, Stella and Karpathiotakis, Manos and Ailamaki, Anastasia}, + abstract={Data cleaning is a time-consuming process that depends on the data analysis that users perform. Existing solutions treat data cleaning as a separate offline process that takes place before analysis begins. Applying data cleaning before analysis assumes a priori knowledge of the inconsistencies and the query workload, thereby requiring effort on understanding and cleaning the data that is unnecessary for the analysis. We propose an approach that performs probabilistic repair of denial constraint violations on-demand, driven by the exploratory analysis that users perform. We introduce Daisy, a system that seamlessly integrates data cleaning into the analysis by relaxing query results. Daisy executes analytical query-workloads over dirty data by weaving cleaning operators into the query plan. 
Our evaluation shows that Daisy adapts to the workload and outperforms traditional offline cleaning on both synthetic and real-world workloads.}, + url={http://infoscience.epfl.ch/record/286040}, + venue_short={SIGMOD '20}} + + +@article{Raza:286055, + title={Adaptive HTAP through Elastic Resource Scheduling}, + author={Raza, Aunn and Chrysogelos, Periklis and Anadiotis, Angelos Christos and Ailamaki, Anastasia}, + abstract={Modern Hybrid Transactional/Analytical Processing (HTAP) systems use an integrated data processing engine that performs analytics on fresh data, which are ingested from a transactional engine. HTAP systems typically consider data freshness at design time, and are optimized for a fixed range of freshness requirements, addressed at a performance cost for either OLTP or OLAP. The data freshness and the performance requirements of both engines, however, may vary with the workload. We approach HTAP as a scheduling problem, addressed at runtime through elastic resource management. We model an HTAP system as a set of three individual engines: an OLTP, an OLAP and a Resource and Data Exchange (RDE) engine. We devise a scheduling algorithm which traverses the HTAP design spectrum through elastic resource management, to meet the workload data freshness requirements. We propose an in-memory system design which is non-intrusive to the current state-of-art OLTP and OLAP engines, and we use it to evaluate the performance of our approach. Our evaluation shows that the performance benefit of our system for OLAP queries increases over time, reaching up to 50% compared to static schedules for 100 query sequences, while maintaining a small, and controlled, drop in the OLTP throughput.}, + url={http://infoscience.epfl.ch/record/286055}, + venue_short={SIGMOD '20}} + + +@article{Dinesh:284290, + title={RetroWrite: Statically Instrumenting COTS Binaries for Fuzzing and Sanitization}, + author={Dinesh, Sushant and Burow, Nathan and Xu, Dongyan and Payer, Mathias}, + abstract={Analyzing the security of closed source binaries is currently impractical for end-users, or even developers who rely on third-party libraries. Such analysis relies on automatic vulnerability discovery techniques, most notably fuzzing with sanitizers enabled. The current state of the art for applying fuzzing or sanitization to binaries is dynamic binary translation, which has prohibitive performance overhead. The alternate technique, static binary rewriting, cannot fully recover symbolization information and hence has difficulty modifying binaries to track code coverage for fuzzing or to add security checks for sanitizers.}, + url={http://infoscience.epfl.ch/record/284290}, + venue_short={IEEE Security and Privacy '20}} + + @article{Oh:276919, - title = {Linebacker: Preserving Victim Cache Lines in Idle Register Files of GPUs}, - author = {Oh, Yunho and Koo, Gunjae and Annavaram, Murali and Ro, Won Woo}, - publisher = {ASSOC COMPUTING MACHINERY}, - journal = {Proceedings Of The 2019 46Th International Symposium On Computer Architecture (ISCA'19)}, - address = {New York}, - pages = {183-196}, - venue_short = {ISCA '19}, - year = {2019}, - abstract = {Modern GPUs suffer from cache contention due to the limited cache size that is shared across tens of concurrently running warps. To increase the per-warp cache size prior techniques proposed warp throttling which limits the number of active warps. Warp throttling leaves several registers to be dynamically unused whenever a warp is throttled. 
Given the stringent cache size limitation in GPUs this work proposes a new cache management technique named Linebacker (LB) that improves GPU performance by utilizing idle register file space as victim cache space. Whenever a CTA becomes inactive, linebacker backs up the registers of the throttled CTA to the off-chip memory. Then, linebacker utilizes the corresponding register file space as victim cache space. If any load instruction finds data in the victim cache line, the data is directly copied to the destination register through a simple register-register move operation. To further improve the efficiency of victim cache linebacker allocates victim cache space only to a select few load instructions that exhibit high data locality. Through a careful design of victim cache indexing and management scheme linebacker provides 29.0% of speedup compared to the previously proposed warp throttling techniques.}, - url = {http://infoscience.epfl.ch/record/276919}, - doi = {10.1145/3307650.3322222}, -} -@article{Gupta:270293, - title = {Distributed Logless Atomic Durability with Persistent Memory}, - author = {Gupta, Siddharth and Daglis, Alexandros and Falsafi, Babak}, - journal = {Proceedings of the 52nd International Symposium on Microarchitecture (MICRO'19)}, - pages = {13}, - venue_short = {MICRO '19}, - year = {2019}, - abstract = {Datacenter operators have started deploying Persistent Memory (PM), leveraging its combination of fast access and persistence for significant performance gains. A key challenge for PM-aware software is to maintain high performance while achieving atomic durability. The latter typically requires the use of logging, which introduces considerable overhead with additional CPU cycles, write traffic, and ordering requirements. In this paper, we exploit the data multiversioning inherent in the memory hierarchy to achieve atomic durability without logging. Our design, LAD, relies on persistent buffering space at the memory controllers (MCs)—already present in modern CPUs—to speculatively accumulate all of a transaction’s updates before they are all atomically committed to PM. LAD employs an on-chip distributed commit protocol in hardware to manage the distributed speculative state each transaction accumulates across multiple MCs. We demonstrate that LAD is a practical design relying on modest hardware modifications to provide atomically durable transactions, while delivering up to 80% of ideal—i.e., PM-oblivious software’s—performance.}, - url = {http://infoscience.epfl.ch/record/270293}, - doi = {10.1145/3352460.3358321}, -} + title={Linebacker: Preserving Victim Cache Lines in Idle Register Files of GPUs}, + author={Oh, Yunho and Koo, Gunjae and Annavaram, Murali and Ro, Won Woo}, + abstract={Modern GPUs suffer from cache contention due to the limited cache size that is shared across tens of concurrently running warps. To increase the per-warp cache size prior techniques proposed warp throttling which limits the number of active warps. Warp throttling leaves several registers to be dynamically unused whenever a warp is throttled. Given the stringent cache size limitation in GPUs this work proposes a new cache management technique named Linebacker (LB) that improves GPU performance by utilizing idle register file space as victim cache space. Whenever a CTA becomes inactive, linebacker backs up the registers of the throttled CTA to the off-chip memory. Then, linebacker utilizes the corresponding register file space as victim cache space. 
If any load instruction finds data in the victim cache line, the data is directly copied to the destination register through a simple register-register move operation. To further improve the efficiency of victim cache linebacker allocates victim cache space only to a select few load instructions that exhibit high data locality. Through a careful design of victim cache indexing and management scheme linebacker provides 29.0% of speedup compared to the previously proposed warp throttling techniques.}, + url={http://infoscience.epfl.ch/record/276919}, + venue_short={ISCA '19}} + + @article{Shamis:273429, + title={Fast General Distributed Transactions with Opacity}, + author={Shamis, Alex and Renzelmann, Matthew and Novakovic, Stanko and Chatzopoulos, Georgios and Dragojevic, Aleksandar and Narayanan, Dushyanth and Castro, Miguel}, + abstract={Transactions can simplify distributed applications by hiding data distribution, concurrency, and failures from the application developer. Ideally the developer would see the abstraction of a single large machine that runs transactions sequentially and never fails. This requires the transactional subsystem to provide opacity (strict serializability for both committed and aborted transactions), as well as transparent fault tolerance with high availability. As even the best abstractions are unlikely to be used if they perform poorly, the system must also provide high performance. Existing distributed transactional designs either weaken this abstraction or are not designed for the best performance within a data center. This paper extends the design of FaRM - which provides strict serializability only for committed transactions - to provide opacity while maintaining FaRM's high throughput, low latency, and high availability within a modern data center. It uses timestamp ordering based on real time with clocks synchronized to within tens of microseconds across a cluster, and a failover protocol to ensure correctness across clock master failures. FaRM with opacity can commit 5.4 million new-order transactions per second when running the TPC-C transaction mix on 90 machines with 3-way replication.}, + url={http://infoscience.epfl.ch/record/273429}, + venue_short={SIGMOD '19}} + + @article{Cohen:263802, + title={Fine-Grain Checkpointing with In-Cache-Line Logging}, + author={Cohen, Nachshon and Aksun, David Teksen and Larus, James}, + abstract={Non-Volatile Memory offers the possibility of implementing high-performance, durable data structures. However, achieving performance comparable to well-designed data structures in non-persistent (transient) memory is difficult, primarily because of the cost of ensuring the order in which memory writes reach NVM. Often, this requires flushing data to NVM and waiting a full memory round-trip time. In this paper, we introduce two new techniques: Fine-Grained Checkpointing, which ensures a consistent, quickly recoverable data structure in NVM after a system failure, and In-Cache-Line Logging, an undo-logging technique that enables recovery of earlier state without requiring cache-line flushes in the normal case.
We implemented these techniques in the Masstree data structure, making it persistent and demonstrating the ease of applying them to a highly optimized system and their low (5.9-15.4%) runtime overhead cost.}, + url={http://infoscience.epfl.ch/record/263802}, + venue_short={ASPLOS '19}} + + @article{Daglis:265809, + title={RPCValet: NI-Driven Tail-Aware Balancing of µs-Scale RPCs}, + author={Daglis, Alexandros and Sutherland, Mark and Falsafi, Babak}, + abstract={Modern online services come with stringent quality requirements in terms of response time tail latency. Because of their decomposition into fine-grained communicating software layers, a single user request fans out into a plethora of short, μs-scale RPCs, aggravating the need for faster inter-server communication. In reaction to that need, we are witnessing a technological transition characterized by the emergence of hardware-terminated user-level protocols (e.g., InfiniBand/RDMA) and new architectures with fully integrated Network Interfaces (NIs). Such architectures offer a unique opportunity for a new NI-driven approach to balancing RPCs among the cores of manycore server CPUs, yielding major tail latency improvements for μs-scale RPCs. We introduce RPCValet, an NI-driven RPC load-balancing design for architectures with hardware-terminated protocols and integrated NIs, that delivers near-optimal tail latency. RPCValet's RPC dispatch decisions emulate the theoretically optimal single-queue system, without incurring synchronization overheads currently associated with single-queue implementations. Our design improves throughput under tight tail latency goals by up to 1.4x, and reduces tail latency before saturation by up to 4x for RPCs with μs-scale service times, as compared to current systems with hardware support for RPC load distribution. RPCValet performs within 15% of the theoretically optimal single-queue system.}, + url={http://infoscience.epfl.ch/record/265809}, + venue_short={ASPLOS '19}} + + @article{Bhattacharyya:270782, + title={SMoTherSpectre: Exploiting Speculative Execution through Port Contention}, + author={Bhattacharyya, Atri and Sandulescu, Alexandra and Neugschwandtner, Matthias and Sorniotti, Alessandro and Falsafi, Babak and Payer, Mathias Josef and Kurmus, Anil}, + abstract={Spectre, Meltdown, and related attacks have demonstrated that kernels, hypervisors, trusted execution environments, and browsers are prone to information disclosure through micro-architectural weaknesses. However, it remains unclear as to what extent other applications, in particular those that do not load attacker-provided code, may be impacted. It also remains unclear as to what extent these attacks are reliant on cache-based side channels. We introduce SMoTherSpectre, a speculative code-reuse attack that leverages port-contention in simultaneously multi-threaded processors (SMoTher) as a side channel to leak information from a victim process. SMoTher is a fine-grained side channel that detects contention based on a single victim instruction. To discover real-world gadgets, we describe a methodology and build a tool that locates SMoTher-gadgets in popular libraries. In an evaluation on glibc, we found hundreds of gadgets that can be used to leak information.
Finally, we demonstrate proof-of-concept attacks against the OpenSSH server, creating oracles for determining four host key bits, and against an application performing encryption using the OpenSSL library, creating an oracle which can differentiate a bit of the plaintext through gadgets in libcrypto and glibc.}, + url={http://infoscience.epfl.ch/record/270782}, + venue_short={CCS '19}} + + +@article{Srivastava:276976, + title={FirmFuzz: Automated IoT Firmware Introspection and Analysis}, + author={Srivastava, Prashast and Peng, Hui and Li, Jiahao and Okhravi, Hamed and Shrobe, Howard and Payer, Mathias}, + abstract={While the number of IoT devices grows at an exhilarating pace their security remains stagnant. Imposing secure coding standards across all vendors is infeasible. Testing individual devices allows an analyst to evaluate their security post deployment. Any discovered vulnerabilities can then be disclosed to the vendors in order to assist them in securing their products. The search for vulnerabilities should ideally be automated for efficiency and furthermore be device-independent for scalability.}, + url={http://infoscience.epfl.ch/record/276976}, + venue_short={CCS '19}} + + @article{Margaritov:276507, - title = {Prefetched Address Translation}, - author = {Margaritov, Artemiy and Ustiugov, Dmitrii and Bugnion, Edouard and Grot, Boris}, - publisher = {ASSOC COMPUTING MACHINERY}, - journal = {Proceedings of the 52nd International Symposium on Microarchitecture (MICRO'19)}, - address = {New York}, - pages = {1023-1036}, - venue_short = {MICRO '19}, - year = {2019}, - abstract = {With explosive growth in dataset sizes and increasing machine memory capacities, per-application memory footprints are commonly reaching into hundreds of GBs. Such huge datasets pressure the TLB, resulting in frequent misses that must be resolved through a page walk - a long-latency pointer chase through multiple levels of the in-memory radix tree-based page table.}, - url = {http://infoscience.epfl.ch/record/276507}, - doi = {10.1145/3352460.3358294}, -} + title={Prefetched Address Translation}, + author={Margaritov, Artemiy and Ustiugov, Dmitrii and Bugnion, Edouard and Grot, Boris}, + abstract={With explosive growth in dataset sizes and increasing machine memory capacities, per-application memory footprints are commonly reaching into hundreds of GBs. Such huge datasets pressure the TLB, resulting in frequent misses that must be resolved through a page walk - a long-latency pointer chase through multiple levels of the in-memory radix tree-based page table.}, + url={http://infoscience.epfl.ch/record/276507}, + venue_short={MICRO '19}} + + +@article{Gupta:270293, + title={Distributed Logless Atomic Durability with Persistent Memory}, + author={Gupta, Siddharth and Daglis, Alexandros and Falsafi, Babak}, + abstract={Datacenter operators have started deploying Persistent Memory (PM), leveraging its combination of fast access and persistence for significant performance gains. A key challenge for PM-aware software is to maintain high performance while achieving atomic durability. The latter typically requires the use of logging, which introduces considerable overhead with additional CPU cycles, write traffic, and ordering requirements. In this paper, we exploit the data multiversioning inherent in the memory hierarchy to achieve atomic durability without logging. 
Our design, LAD, relies on persistent buffering space at the memory controllers (MCs)—already present in modern CPUs—to speculatively accumulate all of a transaction’s updates before they are all atomically committed to PM. LAD employs an on-chip distributed commit protocol in hardware to manage the distributed speculative state each transaction accumulates across multiple MCs. We demonstrate that LAD is a practical design relying on modest hardware modifications to provide atomically durable transactions, while delivering up to 80% of ideal—i.e., PM-oblivious software’s—performance.}, + url={http://infoscience.epfl.ch/record/270293}, + venue_short={MICRO '19}} + + @article{Iyer:264778, - title = {Performance Contracts for Software Network Functions}, - author = {Iyer, Rishabh Ramesh and Pedrosa, Luis David and Zaostrovnykh, Arseniy and Pirelli, Solal and Argyraki, Katerina and Candea, George}, - publisher = {USENIX ASSOC}, - journal = {Proceedings of the ACM Symposium on Networked Systems Design and Implementation (NSDI'19)}, - address = {Berkeley}, - pages = {517-530}, - venue_short = {NSDI '19}, - year = {2019}, - abstract = {Software network functions (NFs), or middleboxes, promise flexibility and easy deployment of network services but face the serious challenge of unexpected performance behaviour. We propose the notion of a performance contract, a construct formulated in terms of performance critical variables, that provides a precise description of NF performance. Performance contracts enable fine-grained prediction and scrutiny of NF performance for arbitrary workloads, without having to run the NF itself. We describe BOLT, a technique and tool for computing such performance contracts for the entire software stack of NFs written in C, including the core NF logic, DPDK packet processing framework, and NIC driver. BOLT takes as input the NF implementation code and outputs the corresponding contract. Under the covers, it combines pre-analysis of a library of stateful NF data structures with automated symbolic execution of the NF’s code. We evaluate BOLT on four NFs—a Maglev-like load balancer, a NAT, an LPM router, and a MAC bridge—and show that its performance contracts predict the dynamic instruction count and memory access count with a maximum gap of 7% between the real execution and the conservatively predicted upper bound. With further engineering, this gap can be reduced.}, - url = {http://infoscience.epfl.ch/record/264778}, -} -@article{Shamis:273429, - title = {Fast General Distributed Transactions with Opacity}, - author = {Shamis, Alex and Renzelmann, Matthew and Novakovic, Stanko and Chatzopoulos, Georgios and Dragojevic, Aleksandar and Narayanan, Dushyanth and Castro, Miguel}, - publisher = {ASSOC COMPUTING MACHINERY}, - journal = {Proceedings Of The 2019 International Conference On Management Of Data (SIGMOD'19)}, - address = {New York}, - series = {International Conference on Management of Data}, - pages = {433-448}, - venue_short = {SIGMOD '19}, - year = {2019}, - abstract = {Transactions can simplify distributed applications by hiding data distribution, concurrency, and failures from the application developer. Ideally the developer would see the abstraction of a single large machine that runs transactions sequentially and never fails. This requires the transactional subsystem to provide opacity (strict serializability for both committed and aborted transactions), as well as transparent fault tolerance with high availability. 
As even the best abstractions are unlikely to be used if they perform poorly, the system must also provide high performance. Existing distributed transactional designs either weaken this abstraction or are not designed for the best performance within a data center. This paper extends the design of FaRM - which provides strict serializability only for committed transactions - to provide opacity while maintaining FaRM's high throughput, low latency, and high availability within a modern data center. It uses timestamp ordering based on real time with clocks synchronized to within tens of microseconds across a cluster, and a failover protocol to ensure correctness across clock master failures. FaRM with opacity can commit 5.4 million neworder transactions per second when running the TPC-C transaction mix on 90 machines with 3-way replication.}, - url = {http://infoscience.epfl.ch/record/273429}, - doi = {10.1145/3299869.3300069}, -} + title={Performance Contracts for Software Network Functions}, + author={Iyer, Rishabh Ramesh and Pedrosa, Luis David and Zaostrovnykh, Arseniy and Pirelli, Solal and Argyraki, Katerina and Candea, George}, + abstract={Software network functions (NFs), or middleboxes, promise flexibility and easy deployment of network services but face the serious challenge of unexpected performance behaviour. We propose the notion of a performance contract, a construct formulated in terms of performance critical variables, that provides a precise description of NF performance. Performance contracts enable fine-grained prediction and scrutiny of NF performance for arbitrary workloads, without having to run the NF itself. We describe BOLT, a technique and tool for computing such performance contracts for the entire software stack of NFs written in C, including the core NF logic, DPDK packet processing framework, and NIC driver. BOLT takes as input the NF implementation code and outputs the corresponding contract. Under the covers, it combines pre-analysis of a library of stateful NF data structures with automated symbolic execution of the NF’s code. We evaluate BOLT on four NFs—a Maglev-like load balancer, a NAT, an LPM router, and a MAC bridge—and show that its performance contracts predict the dynamic instruction count and memory access count with a maximum gap of 7% between the real execution and the conservatively predicted upper bound. With further engineering, this gap can be reduced.}, + url={http://infoscience.epfl.ch/record/264778}, + venue_short={NSDI '19}} + + +@article{Chrysogelos:262531, + title={HetExchange: Encapsulating heterogeneous CPU-GPU parallelism in JIT compiled engines}, + author={Chrysogelos, Periklis and Karpathiotakis, Manos and Appuswamy, Raja and Ailamaki, Anastasia}, + abstract={Modern server hardware is increasingly heterogeneous as hardware accelerators, such as GPUs, are used together with multicore CPUs to meet the computational demands of modern data analytics workloads. Unfortunately, query parallelization techniques used by analytical database engines are designed for homogeneous multicore servers, where query plans are parallelized across CPUs to process data stored in cache coherent shared memory. Thus, these techniques are unable to fully exploit available heterogeneous hardware, where one needs to exploit task-parallelism of CPUs and data-parallelism of GPUs for processing data stored in a deep, non-cache-coherent memory hierarchy with widely varying access latencies and bandwidth. 
In this paper, we introduce HetExchange–a parallel query execution framework that encapsulates the heterogeneous parallelism of modern multi-CPU–multi-GPU servers and enables the parallelization of (pre-)existing sequential relational operators. In contrast to the interpreted nature of traditional Exchange, HetExchange is designed to be used in conjunction with JIT compiled engines in order to allow a tight integration with the proposed operators and generation of efficient code for heterogeneous hardware. We validate the applicability and efficiency of our design by building a prototype that can operate over both CPUs and GPUs, and enables its operators to be parallelism- and data-location-agnostic. In doing so, we show that efficiently exploiting CPU–GPU parallelism can provide 2.8x and 6.4x improvement in performance compared to state-of-the-art CPU-based and GPU-based DBMS.}, + url={http://infoscience.epfl.ch/record/262531}, + venue_short={PVLDB '19}} + + +@article{Nikolopoulos:265801, + title={Retroactive Packet Sampling for Traffic Receipts}, + author={Nikolopoulos, Pavlos and Pappas, Christos and Argyraki, Katerina and Perrig, Adrian}, + abstract={Is it possible to design a packet-sampling algorithm that prevents the network node that performs the sampling from treating the sampled packets preferentially? We study this problem in the context of designing a "network transparency" system. In this system, networks emit receipts for a small sample of the packets they observe, and a monitor collects these receipts to estimate each network's loss and delay performance. Sampling is a good building block for this system, because it enables a solution that is flexible and combines low resource cost with quantifiable accuracy. The challenge is cheating resistance: when a network's performance is assessed based on the conditions experienced by a small traffic sample, the network has a strong incentive to treat the sampled packets better than the rest. We contribute a sampling algorithm that is provably robust to such prioritization attacks, enables network performance estimation with quantifiable accuracy, and requires minimal resources. We confirm our analysis using real traffic traces.}, + url={http://infoscience.epfl.ch/record/265801}, + venue_short={SIGMETRICS '19}} + + @article{Zastrovnykh:275534, - title = {Verifying Software Network Functions with No Verification Expertise}, - author = {Zastrovnykh, Arseniy and Pirelli, Solal and Iyer, Rishabh and Rizzo, Matteo and Pedrosa, Luis and Argyraki, Katerina and Candea, George}, - publisher = {ACM}, - journal = {Proceedings of the 27th ACM Symposium on Operating Systems Principles (SOSP'19)}, - pages = {16. 275-290}, - venue_short = {SOSP '19}, - year = {2019}, - abstract = {We present the design and implementation of Vigor, a software stack and toolchain for building and running software network middleboxes that are guaranteed to be correct, while preserving competitive performance and developer productivity. Developers write the core of the middlebox---the network function (NF)---in C, on top of a standard packet-processing framework, putting persistent state in data structures from Vigor's library; the Vigor toolchain then automatically verifies that the resulting software stack correctly implements a specification, which is written in Python. 
Vigor has three key features: network function developers need no verification expertise, and the verification process does not require their assistance (push-button verification); the entire software stack is verified, down to the hardware (full-stack verification); and verification can be done in a pay-as-you-go manner, i.e., instead of investing upfront a lot of time in writing and verifying a complete specification, one can specify one-off properties in a few lines of Python and verify them without concern for the rest. We developed five representative NFs---a NAT, a Maglev load balancer, a MAC-learning bridge, a firewall, and a traffic policer---and verified with Vigor that they satisfy standards-derived specifications, are memory-safe, and do not crash or hang. We show that they provide competitive performance. The Vigor framework is available at [http://vigor.epfl.ch].}, - url = {http://infoscience.epfl.ch/record/275534}, - doi = {10.1145/3341301.3359647}, -} -@article{Tsai:275441, - title = {Pythia: Remote Oracles for the Masses}, - author = {Tsai, Shin-Yeh and Payer, Mathias and Zhang, Yiying}, - publisher = {USENIX ASSOC}, - journal = {Proceedings Of The 28th Usenix Security Symposium}, - address = {Berkeley}, - pages = {693-710}, - venue_short = {USENIX Security '19}, - year = {2019}, - abstract = {Remote Direct Memory Access (RDMA) is a technology that allows direct access from the network to a machine's main memory without involving its CPU. RDMA offers low-latency, high-bandwidth performance and low CPU utilization. While RDMA provides massive performance boosts and has thus been adopted by several major cloud providers, security concerns have so far been neglected.}, - url = {http://infoscience.epfl.ch/record/275441}, -} + title={Verifying Software Network Functions with No Verification Expertise}, + author={Zaostrovnykh, Arseniy and Pirelli, Solal and Iyer, Rishabh and Rizzo, Matteo and Pedrosa, Luis and Argyraki, Katerina and Candea, George}, + abstract={We present the design and implementation of Vigor, a software stack and toolchain for building and running software network middleboxes that are guaranteed to be correct, while preserving competitive performance and developer productivity. Developers write the core of the middlebox---the network function (NF)---in C, on top of a standard packet-processing framework, putting persistent state in data structures from Vigor's library; the Vigor toolchain then automatically verifies that the resulting software stack correctly implements a specification, which is written in Python. Vigor has three key features: network function developers need no verification expertise, and the verification process does not require their assistance (push-button verification); the entire software stack is verified, down to the hardware (full-stack verification); and verification can be done in a pay-as-you-go manner, i.e., instead of investing upfront a lot of time in writing and verifying a complete specification, one can specify one-off properties in a few lines of Python and verify them without concern for the rest. We developed five representative NFs---a NAT, a Maglev load balancer, a MAC-learning bridge, a firewall, and a traffic policer---and verified with Vigor that they satisfy standards-derived specifications, are memory-safe, and do not crash or hang. We show that they provide competitive performance.
+The Vigor framework is available at http://vigor.epfl.ch.},
+ url={http://infoscience.epfl.ch/record/275534},
+ venue_short={SOSP '19}}
+
+
+@article{Burow:275478,
+ title={SoK: Shining Light on Shadow Stacks},
+ author={Burow, Nathan and Zhang, Xinping and Payer, Mathias},
+ abstract={Control-Flow Hijacking attacks are the dominant attack vector against C/C++ programs. Control-Flow Integrity (CFI) solutions mitigate these attacks on the forward edge, i.e., indirect calls through function pointers and virtual calls. Protecting the backward edge is left to stack canaries, which are easily bypassed through information leaks. Shadow Stacks are a fully precise mechanism for protecting backwards edges, and should be deployed with CFI mitigations.},
+ url={http://infoscience.epfl.ch/record/275478},
+ venue_short={IEEE Security and Privacy '19}}
+
+
+@article{Drijvers:275481,
+ title={On the Security of Two-Round Multi-Signatures},
+ author={Drijvers, Manu and Edalatnejad, Kasra and Ford, Bryan and Kiltz, Eike and Loss, Julian and Neven, Gregory and Stepanovs, Igors},
+ abstract={A multi-signature scheme allows a group of signers to collaboratively sign a message, creating a single signature that convinces a verifier that every individual signer approved the message. The increased interest in technologies to decentralize trust has triggered the proposal of highly efficient two-round Schnorr-based multi-signature schemes designed to scale up to thousands of signers, namely BCJ by Bagherzandi et al. (CCS 2008), MWLD by Ma et al. (DCC 2010), CoSi by Syta et al. (S&P 2016), and MuSig by Maxwell et al. (ePrint 2018). In this work, we point out serious security issues in all currently known two-round multi-signature schemes (without pairings). First, we prove that none of the schemes can be proved secure without radically departing from currently known techniques. Namely, we show that if the one-more discrete-logarithm problem is hard, then no algebraic reduction exists that proves any of these schemes secure under the discrete-logarithm or one-more discrete-logarithm problem. We point out subtle flaws in the published security proofs of the above schemes (except CoSi, which was not proved secure) to clarify the contradiction between our result and the existing proofs. Next, we describe practical sub-exponential attacks on all schemes, providing further evidence of their insecurity. Being left without secure two-round multi-signature schemes, we present mBCJ, a variant of the BCJ scheme that we prove secure under the discrete-logarithm assumption in the random-oracle model.
+Our experiments show that mBCJ barely affects scalability compared to CoSi, allowing 16384 signers to collaboratively sign a message in about 2 seconds, making it a highly practical and provably secure alternative for large-scale deployments.},
+ url={http://infoscience.epfl.ch/record/275481},
+ venue_short={IEEE Security and Privacy '19}}
+
+
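+%% Illustrative template for new entries (not a real publication; all field
+%% values are placeholders). dblpbibcloud.py matches local entries against
+%% DBLP by normalized title and reads the title, author, abstract, url,
+%% year, and venue_short fields, so a minimal entry needs at least:
+%%
+%% @article{Lastname:000000,
+%%   title={...},
+%%   author={Lastname, Firstname and ...},
+%%   abstract={...},
+%%   url={...},
+%%   year={2021},
+%%   venue_short={VENUE '21}}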
diff --git a/dblpbibcloud.py b/dblpbibcloud.py
new file mode 100755
index 0000000..577bce5
--- /dev/null
+++ b/dblpbibcloud.py
@@ -0,0 +1,283 @@
+#!/usr/bin/python3
+# -*- coding: latin-1 -*-
+
+####
+#### dblpbibcloud.py
+#### based on bibcloud.py
+####
+
+# Copyright 2015-21 Ecole Polytechnique Federale Lausanne (EPFL)
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+#
+# dblpbibcloud:
+# -- all input arguments are in the code itself
+# -- input: set of principal investigators (lab heads) with start date
+# -- input: set of conferences
+# -- input: exclusion list
+
+
+
+import sys
+import os
+import xml.etree.ElementTree as ET
+import subprocess
+import time
+import locale
+import requests
+import functools
+import bibtexparser
+import itertools
+
+DEBUG = 0
+
+AUTHORS = {
+    "bugnion":   {"dblp": "b/EBugnion"},
+    "larus":     {"dblp": "l/JamesRLarus"},
+    "candea":    {"dblp": "c/GeorgeCandea"},
+    "argyraki":  {"dblp": "71/6861"},
+    "kashyap":   {"dblp": "145/0912", "year": 2020},
+    "ailamaki":  {"dblp": "a/AnastassiaAilamaki"},
+    "guerraoui": {"dblp": "g/RachidGuerraoui"},
+    "kermarrec": {"dblp": "86/676"},
+    "falsafi":   {"dblp": "f/BabakFalsafi"},
+    "payer":     {"dblp": "31/1273"},
+    "ford":      {"dblp": "f/BryanFord"}
+}
+
+
+CONFERENCES_ORG = [
+    "ASPLOS",
+    "SOSP", "OSDI",
+    "SIGCOMM", "NSDI", "ISCA", "MICRO",
+    "SIGMOD", "VLDB", "PVLDB",
+    "MobiCom", "MobiSys", "SenSys", "IMC",
+    "SIGMETRICS",
+    "PLDI", "CCS", "SP", "USS",  # comma required: without it, "USS" and
+                                 # "Eurosys" concatenate into "USSEurosys"
+    #### not on George's original list
+    "Eurosys"
+]
+
+
+CONFERENCES = [x.upper() for x in CONFERENCES_ORG]
+DEBUG and print(CONFERENCES)
+
+
+CONFERENCES_PRETTY = {
+    "SP":  "IEEE Security and Privacy",
+    "USS": "USENIX Security"
+}
+
+
+############
+### globals
+############
+
+PUBLICATIONS = {}
+KEYPUBS = []
+BIBALL = {}
+
+
+#### added for dblpbibcloud
+
+def sort_lambda(a, b):
+    # cmp-style comparator: newest year first; within a year, by venue name
+    if a["year"] == b["year"]:
+        if a["conf"] < b["conf"]:
+            return -1
+        elif a["conf"] > b["conf"]:
+            return 1
+        else:
+            return 0
+    else:
+        return b["year"] - a["year"]
+
+
+def normalize_title(a):
+    # canonicalize a title for cross-source matching: strip all spaces,
+    # drop a single trailing period, and lower-case the result
+    x = a.find(" ")
+    if x == len(a) - 1:
+        return normalize_title(a[:x])
+    elif x >= 0:
+        return normalize_title(a[:x] + a[x+1:])
+    else:
+        dot = a[len(a)-1]
+        if dot == ".":
+            b = a[:len(a)-1]
+            return b.lower()
+        else:
+            return a.lower()
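+
+# Quick illustration of the normalization above (assumes non-empty titles;
+# the title strings here are examples, not entries from any bib file):
+# spacing, case, and a trailing period no longer prevent a match between a
+# DBLP title and a local bib title.
+if DEBUG:
+    assert normalize_title("Shining Light on Shadow Stacks.") == \
+        normalize_title("shining  light  on shadow stacks")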
+
+def print_bibentry(p, b):
+    # emit one entry of the generated bib/pubs.bib file
+    F.write("@article{" + b["ID"] + ",\n")
+    F.write(" title={" + b["title"] + "},\n")
+    F.write(" author={" + b["author"] + "},\n")
+    F.write(" abstract={" + b["abstract"] + "},\n")
+    F.write(" url={" + b["url"] + "},\n")
+    conf = p["conf"].upper()
+    if conf in CONFERENCES_PRETTY:
+        conf = CONFERENCES_PRETTY[conf]
+    if "venue_short" in b:
+        F.write(" venue_short={" + b["venue_short"] + "}}\n\n\n")
+    else:
+        F.write(" venue_short={" + conf + " '" + str(p["year"] - 2000) + "}}\n\n\n")
+
+
+###################################################
+#################### main #########################
+###################################################
+# process bib file from ARGV
+print("dblpbibcloud: This is dblpbibcloud ... Use at your own risk ... see source for documentation")
+
+
+if not os.path.exists(".bibcloud"):
+    os.mkdir(".bibcloud")
+
+for author in AUTHORS:
+    print("Fetching author", author, AUTHORS[author])
+    url = "https://dblp.org/pid/" + AUTHORS[author]["dblp"] + ".xml"
+    f = requests.get(url)
+    xml = ET.ElementTree(ET.fromstring(f.text))
+    root = xml.getroot()
+    for child in root:
+        if child.tag == "r":
+            for paper in child:
+                key = paper.attrib['key']
+                if key in PUBLICATIONS:
+                    PUBLICATIONS[key]["authors"].append(author)
+                else:
+                    PUBLICATIONS[key] = {"xml": paper, "authors": [author]}
+
+
+for p in PUBLICATIONS:
+    x = p.split("/")
+    xml = PUBLICATIONS[p]["xml"]
+    authors = PUBLICATIONS[p]["authors"]
+    if x[1].upper() in CONFERENCES:
+        year = 0
+        title = ""
+        for c in xml:
+            if c.tag == "year":
+                year = int(c.text)
+            if c.tag == "title":
+                title = c.text
+        year == 0 and sys.exit("could not find year for " + str(p))
+        title == "" and sys.exit("could not find title for " + str(p))
+        keep = 0
+        for a in authors:
+            minyear = 2019
+            if "year" in AUTHORS[a]:
+                minyear = AUTHORS[a]["year"]
+                DEBUG and print("adjusting min year for", a, minyear)
+            if year >= minyear:
+                keep = 1
+        if keep:
+            DEBUG and print("key", p, x[1], year, authors)
+            KEYPUBS.append({
+                "key": p,
+                "conf": x[1],
+                "year": year,
+                "authors": authors,
+                "title": normalize_title(title)
+            })
+
+KEYPUBS.sort(key=functools.cmp_to_key(sort_lambda))
+
+for x in KEYPUBS:
+    print("%4d %10s %30s %20s %s" % (x["year"], x["conf"], x["authors"], x["key"], x["title"]))
+
+
+## add lab-specific file here for those who don't use infoscience
+## (e.g., gannimo)
+for filename in ["bib/misc.bib"]:
+    with open(filename) as bibtex_file:
+        bib_db = bibtexparser.load(bibtex_file)
+    print("reading local bib file", len(bib_db.entries), "entries")
+    for x in bib_db.entries:
+        title = normalize_title(x["title"])
+        found = 0
+        for y in KEYPUBS:
+            if y["title"] == title:
+                print(" -- found", y["key"])
+                y["bib"] = x
+                found = 1
+        if found == 0:
+            print(" -- not found; inserted", x["venue_short"], x["year"])
+            KEYPUBS.append({"key": "NONE",
+                            "conf": x["venue_short"],
+                            "year": int(x["year"]),
+                            "authors": "NONE",
+                            "title": title,
+                            "bib": x})
+
+# sort again, now that the local entries have been merged in
+KEYPUBS.sort(key=functools.cmp_to_key(sort_lambda))
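+
+# Note: the cmp-style sort_lambda is equivalent to a key-based sort on the
+# tuple (-year, conf) -- newest first, then venue name. The original code
+# path is kept above; the same ordering could be expressed as (sketch):
+#   KEYPUBS.sort(key=lambda e: (-e["year"], e["conf"]))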
print("!!!! Duplicateentries for title (1/2)"+title) + print(BIBALL[title]) + print("!!!! Duplicate entries for title (2/2) "+title) + print(x) + else: + BIBALL[title] = x + + + + +F = open("bib/pubs.bib","w") +F.write("%% DO NOT EDIT\n") +F.write("%% Generated by dblpbibcloud.py\n") +for p in KEYPUBS: + key = p["key"] + title = p["title"] + if title in BIBALL: + DEBUG and print(" match for",key) + if "bib" in p: + # also found locally ... which should not be the case + print() + print() + print("REMOVE from bib/misc.pub",p["bib"]) + + if "abstract" in BIBALL[title]: + print_bibentry(p,BIBALL[title]) + else: + print("No abstract in infoscience for ",p,BIBALL[title]) + elif "bib" in p: + DEBUG and print("Found locally") + print_bibentry(p,p["bib"]) + else: + print("nomatch for",p) + +F.close() +sys.exit(0) + + +