diff --git a/.gitignore b/.gitignore index 407370c..e4d3c02 100644 --- a/.gitignore +++ b/.gitignore @@ -138,5 +138,11 @@ main.glo main.out mainNotes.bib pdfa.xmpi -main.pdf + main.synctex(busy) +main.pdf +main.bbl +main.blg +main_diff.acn +main_diff.glo +main_diffNotes.bib diff --git a/_projects/json_summaries/project_21.json b/_projects/json_summaries/project_21.json index 43e1dc3..6856793 100644 --- a/_projects/json_summaries/project_21.json +++ b/_projects/json_summaries/project_21.json @@ -2,6 +2,6 @@ "project_number": 21, "project_name": "Benchmarking Molecular Descriptors with Actively Identified Subsets (MolDAIS)", "video_url": "https://www.youtube.com/watch?v=uYXAe3sRUSo", - "summary": "This research presents a novel approach called MOLDES (Molecular Descriptors with Actively Identified Subspaces) for molecular property optimization. The method addresses the challenge of optimizing molecules in high-dimensional spaces by using molecular descriptors - sets of rotationally and translationally invariant calculations performed on molecular graphs - coupled with active subspace identification. MOLDES employs a sparse axis-aligned subspace Gaussian Process prior, which actively learns an encoding while performing Bayesian optimization.The researchers evaluated MOLDES on three case studies: experimental lipophilicity (4,200 compounds), log P optimization benchmark (250,000 molecules), and power conversion efficiency from the Harvard Clean Energy Project (30,000 compounds). In all cases, MOLDES demonstrated superior performance compared to other optimizers, particularly in larger datasets. For the log P optimization, MOLDES consistently found the optimal molecule within 100 iterations. The method also showed strong performance in constrained optimization problems, often achieving the best-case scenario and maintaining a favorable worst-case scenario compared to other methods. 
Overall, MOLDES proved efficient in identifying high-performing molecules in low-data regimes, offering a promising approach for molecular property optimization tasks.", + "summary": "This research presents a novel approach called MolDAIS (Molecular Descriptors with Actively Identified Subspaces) for molecular property optimization. The method addresses the challenge of optimizing molecules in high-dimensional spaces by using molecular descriptors - sets of rotationally and translationally invariant calculations performed on molecular graphs - coupled with active subspace identification. MolDAIS employs a sparse axis-aligned subspace Gaussian Process prior, which actively learns an encoding while performing Bayesian optimization. Recent works\\cite{sorourifar_accelerating_2024,maus_local_2023} are increasingly turning towards active encoding of molecular feature spaces. The researchers evaluated MolDAIS on three case studies: experimental lipophilicity (4,200 compounds), log P optimization benchmark (250,000 molecules), and power conversion efficiency from the Harvard Clean Energy Project (30,000 compounds). In all cases, MolDAIS demonstrated superior performance compared to other optimizers, particularly in larger datasets. For the log P optimization, MolDAIS consistently found the optimal molecule within 100 iterations. The method also showed strong performance in constrained optimization problems, often achieving the best-case scenario and maintaining a favorable worst-case scenario compared to other methods. 
Overall, MolDAIS proved efficient in identifying high-performing molecules in low-data regimes, offering a promising approach for molecular property optimization tasks.", "status": "success" } \ No newline at end of file diff --git a/_projects/json_summaries/project_22.json b/_projects/json_summaries/project_22.json index 5cca510..b5c9587 100644 --- a/_projects/json_summaries/project_22.json +++ b/_projects/json_summaries/project_22.json @@ -2,6 +2,6 @@ "project_number": 22, "project_name": "Chemical Similarity-Informed Earth Mover’s Distance Kernel Bayesian Optimization for Predicting the Properties of Molecules and Molecular Mixtures", "video_url": "https://www.youtube.com/watch?v=I179UR8P054", - "summary": "This research project focuses on developing chemical similarity-informed distance functions and kernels for explainable Bayesian optimization, specifically targeting the prediction of properties for molecular mixtures. The researchers propose a novel approach that bypasses the need for embedding vectors by directly providing pairwise distances between data points in the kernel function of a Gaussian Process (GP) model.The project introduces the Earth Mover's Distance (EMD) kernel into the GP framework to calculate pairwise distances between mixtures based on individual component distances. This method was tested for predicting yields of binary reactant mixtures, demonstrating high chemical resolution in mixture analysis. The results show that the EMD kernel achieves accurate yield predictions with narrow distributions for both high and low-yield cases, indicating improved performance in distinguishing between different mixture compositions. 
By incorporating smooth distance metrics, the researchers successfully extended Bayesian optimization techniques from pure components to molecular mixtures, potentially enhancing the efficiency and interpretability of materials property prediction in complex chemical systems.", + "summary": "This research project focuses on developing chemical similarity-informed distance functions and kernels for explainable Bayesian optimization, specifically targeting the prediction of properties for molecular mixtures. The researchers propose a novel approach that bypasses the need for embedding vectors by directly providing pairwise distances between data points in the kernel function of a Gaussian Process (GP) model\\cite{moss_gaussian_2020}. The project introduces the Earth Mover's Distance (EMD) kernel\\cite{hargreaves_earth_2020} into the GP framework to calculate pairwise distances between mixtures based on individual component distances. This method was tested for predicting yields of binary reactant mixtures, demonstrating high chemical resolution in mixture analysis. The results show that the EMD kernel achieves accurate yield predictions with narrow distributions for both high and low-yield cases, indicating improved performance in distinguishing between different mixture compositions. 
By incorporating smooth distance metrics, the researchers successfully extended Bayesian optimization techniques from pure components to molecular mixtures, potentially enhancing the efficiency and interpretability of materials property prediction in complex chemical systems.", "status": "success" } \ No newline at end of file diff --git a/_projects/json_summaries/project_24.json b/_projects/json_summaries/project_24.json index 5685eb7..823c231 100644 --- a/_projects/json_summaries/project_24.json +++ b/_projects/json_summaries/project_24.json @@ -1,8 +1,7 @@ { "project_number": 24, "project_name": "ScattBO Benchmark - Bayesian optimisation for materials discovery", - "video_url": "https://github.com/AndySAnker/ScattBO/tree/main/presentation", - "summaries": null, - "status": "failed", - "error": "Could not determine the video ID for the URL \"https://github.com/AndySAnker/ScattBO/tree/main/presentation\"." + "video_url": "https://twitter.com/SodeAndy/status/1773474538631651769", + "summary": "This project presents ScattBO, a Python-based benchmark that simulates a self-driving laboratory (SDL) for materials discovery. A self-driving laboratory is an autonomous platform that conducts machine learning-selected experiments to achieve a user-defined objective, such as synthesizing a specific material\\cite{szymanski_autonomous_2023}. The benchmark addresses the challenge that such SDLs can be expensive to run, making intelligent experimental planning essential, while only a few people have access to real SDLs for materials discovery. ScattBO provides an in silico simulation of an SDL where, based on synthesis parameters, the benchmark 'synthesizes' a structure, calculates the scattering pattern\\cite{johansen_gpu-accelerated_2024}, and compares it to the target structure's scattering pattern. 
The benchmark acknowledges that scattering data may not be sufficient to conclusively validate that the target material has been synthesized\\cite{leeman_challenges_2024}, but it can include other types of data as long as they can be simulated. Because few researchers have access to a real SDL, it is currently challenging to benchmark Bayesian optimization algorithms for experimental planning tasks in SDLs; ScattBO fills this gap by providing an accessible simulation environment.", + "status": "success" } \ No newline at end of file diff --git a/_projects/json_summaries/project_25.json b/_projects/json_summaries/project_25.json index 6e0199e..5709c85 100644 --- a/_projects/json_summaries/project_25.json +++ b/_projects/json_summaries/project_25.json @@ -2,6 +2,6 @@ "project_number": 25, "project_name": "Bayesian Optimized De Novo Drug Design for Selective Kinase Targeting ", "video_url": "https://www.youtube.com/watch?v=nVtTYXxG7i4", - "summary": "This project focused on incorporating Bayesian optimization to guide de novo drug design, specifically targeting growth factor receptors for cancer therapeutics. The team built upon the doct string paper, Python library, and dataset by Garcia-Oron and Bacal, using a Gaussian process with a Matérn kernel on Morgan fingerprint representations. They employed a graph genetic algorithm to generate SMILES strings guided by the Bayesian optimization output.The researchers explored both selective and promiscuous binding scenarios. For selective binding, they optimized for binding to FGFR1 while penalizing overbinding to other growth factor receptors relative to their median. For promiscuous binding, they maximized the maximum binding affinity across multiple receptors. They found that a sigmoidal penalty function was more effective than simple absolute differences when optimizing against multiple proteins. The team also incorporated a drug-likeness measure (QED) as a penalty in the optimization process, though its effect was limited. 
Due to time and resource constraints, the project was unable to extensively explore the chemical space or use more accurate binding affinity calculations beyond docking. The authors suggest that future work could incorporate known unknowns through an evasion process, further optimize selective binding, and compare different molecular representations.", + "summary": "This project focused on incorporating Bayesian optimization to guide de novo drug design, specifically targeting growth factor receptors for cancer therapeutics. The team built upon the DOCKSTRING paper, Python library, and dataset\\cite{garcia_dockstring_2022}, using a Gaussian process with a Matérn kernel on Morgan fingerprint representations. They employed a graph genetic algorithm to generate SMILES strings guided by the Bayesian optimization output. The researchers explored both selective and promiscuous binding scenarios. For selective binding, they optimized for binding to FGFR1 while penalizing overbinding to other growth factor receptors relative to their median. For promiscuous binding, they maximized the maximum binding affinity across multiple receptors. They found that a sigmoidal penalty function was more effective than simple absolute differences when optimizing against multiple proteins. The team also incorporated a drug-likeness measure (QED)\\cite{bickerton_quantifying_2012} as a penalty in the optimization process, though its effect was limited. Due to time and resource constraints, the project was unable to extensively explore the chemical space or use more accurate binding affinity calculations beyond docking. 
The authors suggest that future work could incorporate known unknowns through an evasion process, further optimize selective binding, and compare different molecular representations.", "status": "success" } \ No newline at end of file diff --git a/_projects/json_summaries/project_27.json b/_projects/json_summaries/project_27.json index d87d5dc..05d0543 100644 --- a/_projects/json_summaries/project_27.json +++ b/_projects/json_summaries/project_27.json @@ -2,6 +2,6 @@ "project_number": 27, "project_name": "How does initial warm-up data influence Bayesian optimization in low-data experimental settings?", "video_url": "https://www.youtube.com/watch?v=4gPTMaarQt0", - "summary": "This research project investigated the influence of warm-up sampling methods and dataset sizes on property optimization in low data regimes, specifically focusing on molecular property prediction. The team used the QM9 dataset and selected band gap as the optimization target. They compared two chemically-inspired sampling methods for the warm-up dataset: Morgan fingerprints and MolFormer language model fingerprints.The researchers performed dimensionality reduction on the fingerprints using PCA, projecting them into a 2D space for sampling. They conducted experiments to analyze how the warm-up dataset size affects optimization results. The most significant finding was the comparison between Morgan fingerprints and MolFormer fingerprints at a constant data regime of 50 data points. The results showed that MolFormer fingerprints substantially outperformed Morgan fingerprints, suggesting that pre-trained models on large chemical spaces can potentially improve model optimization rates. 
This study aims to initiate broader discussions on how dataset sizes and sampling methodologies impact final optimization tasks in molecular property prediction.", + "summary": "This research project investigated the influence of warm-up sampling methods and dataset sizes on property optimization in low data regimes, specifically focusing on molecular property prediction. The team used the QM9 dataset\\cite{ramakrishnan_quantum_2014} and selected band gap as the optimization target. They compared two chemically-inspired sampling methods for the warm-up dataset: Morgan fingerprints and MolFormer language model fingerprints. The researchers also referenced the GDB-17 chemical universe database\\cite{ruddigkeit_enumeration_2012} in their background work. The researchers performed dimensionality reduction on the fingerprints using PCA, projecting them into a 2D space for sampling. They conducted experiments to analyze how the warm-up dataset size affects optimization results. The most significant finding was the comparison between Morgan fingerprints and MolFormer fingerprints at a constant data regime of 50 data points. The results showed that MolFormer fingerprints substantially outperformed Morgan fingerprints, suggesting that pre-trained models on large chemical spaces can potentially improve model optimization rates. 
This study aims to initiate broader discussions on how dataset sizes and sampling methodologies impact final optimization tasks in molecular property prediction.", "status": "success" } \ No newline at end of file diff --git a/_projects/json_summaries/project_35.json b/_projects/json_summaries/project_35.json index 50540e0..9624224 100644 --- a/_projects/json_summaries/project_35.json +++ b/_projects/json_summaries/project_35.json @@ -2,6 +2,6 @@ "project_number": 35, "project_name": "Tutorial for GAUCHE - A Library for Gaussian Processes in Chemistry", "video_url": "https://x.com/Ryan__Rhys/status/1820723528469262419", - "summary": "This research project focuses on implementing input warping for Bayesian Optimization within the Gauche library, which was previously developed by the team and published at NeurIPS 2023. The primary innovation of Gauche is the introduction of Gaussian process (GP) kernels that enable modeling of discrete entities such as SMILES strings, graphs, and bit vectors, which are common representations in molecular sciences.The motivation behind using Gaussian processes for Bayesian Optimization is their suitability for automated tasks where fine-tuning for each problem is not feasible. GPs offer a good balance between performance and simplicity, with few trainable hyperparameters that can reliably converge on each iteration of the Bayesian Optimization loop. This makes them particularly attractive as surrogate models compared to more complex alternatives like deep neural networks, which might require careful monitoring during training at each iteration. The Gauche library extends the applicability of GPs to discrete input spaces, allowing for Bayesian Optimization over molecular representations. 
The project team has developed a range of tutorials and applications, including molecular property prediction, protein fitness prediction, and sparse GP regression, all available in the Gauche GitHub repository.", + "summary": "This research project focuses on implementing input warping for Bayesian Optimization within the Gauche library\\cite{griffiths_gauche_2024}, which was previously developed by the team and published at NeurIPS 2023. The primary innovation of Gauche is the introduction of Gaussian process (GP) kernels that enable modeling of discrete entities such as SMILES strings, graphs, and bit vectors, which are common representations in molecular sciences. The motivation behind using Gaussian processes for Bayesian Optimization is their suitability for automated tasks where fine-tuning for each problem is not feasible. GPs offer a good balance between performance and simplicity, with few trainable hyperparameters that can reliably converge on each iteration of the Bayesian Optimization loop. This makes them particularly attractive as surrogate models compared to more complex alternatives like deep neural networks, which might require careful monitoring during training at each iteration. The Gauche library extends the applicability of GPs to discrete input spaces, allowing for Bayesian Optimization over molecular representations. 
The project team has developed a range of tutorials and applications, including molecular property prediction, protein fitness prediction, and sparse GP regression, all available in the Gauche GitHub repository.", "status": "success" } \ No newline at end of file diff --git a/_projects/json_summaries/project_36.json b/_projects/json_summaries/project_36.json index f9ad83e..2b56986 100644 --- a/_projects/json_summaries/project_36.json +++ b/_projects/json_summaries/project_36.json @@ -2,6 +2,6 @@ "project_number": 36, "project_name": "Scalable Nonmyopic Bayesian Optimization in Dynamic Cost Settings", "video_url": "https://youtu.be/CXweDiS_wbI", - "summary": "This research project focuses on scalable Bayesian optimization in dynamic settings, addressing limitations of previous approaches that rely on myopic acquisition functions and assume fixed cost structures. The researchers introduce a novel method using non-myopic acquisition functions that incorporate a look-ahead mechanism and dynamic cost functions.The project evaluates the proposed algorithm, named HBE, through two main experimental setups. First, they use synthetic functions across 14 different environments with varying dimensions to test scalability. Second, they apply the method to a real-world protein sequence design problem, aiming to maximize a protein score. The researchers compare their HBE algorithm against six other acquisition functions, including state-of-the-art methods. To enhance practicality, they integrate automatic hyperparameter tuning to reduce the number of optimization parameters. 
While specific results are not provided in the given context, the approach aims to overcome suboptimal resource allocation in dynamic cost experiments and improve upon existing Bayesian optimization techniques.", + "summary": "This research project focuses on scalable Bayesian optimization in dynamic settings, addressing limitations of previous approaches that rely on myopic acquisition functions and assume fixed cost structures. The researchers introduce a novel method using non-myopic acquisition functions\\cite{jiang_efficient_2020} that incorporate a look-ahead mechanism and dynamic cost functions. The project evaluates the proposed algorithm, named HBE, through two main experimental setups. First, they use synthetic functions across 14 different environments with varying dimensions to test scalability. Second, they apply the method to a real-world protein sequence design problem, aiming to maximize a protein score. The researchers compare their HBE algorithm against six other acquisition functions, including state-of-the-art methods. To enhance practicality, they integrate automatic hyperparameter tuning to reduce the number of optimization parameters. 
While specific results are not provided in the given context, the approach aims to overcome suboptimal resource allocation in dynamic cost experiments and improve upon existing Bayesian optimization techniques.", "status": "success" } \ No newline at end of file diff --git a/_projects/json_summaries/project_39.json b/_projects/json_summaries/project_39.json index 15875ca..dde45ff 100644 --- a/_projects/json_summaries/project_39.json +++ b/_projects/json_summaries/project_39.json @@ -2,6 +2,6 @@ "project_number": 39, "project_name": "Divide and Conquer - Local Gaussian Processes to design Covalent Organic Frameworks for Methane Deliverable Capacity", "video_url": "https://www.youtube.com/watch?v=iog-07Ekp9g", - "summary": "This research project focuses on improving Bayesian Optimization (BO) for high-dimensional, large-scale datasets, specifically applied to the design of Covalent Organic Frameworks (COFs) for methane storage. The researchers developed a novel approach combining unsupervised clustering with local Gaussian Process (GP) models to enhance BO efficiency in the high data regime.The method begins by using K-means clustering to partition the dataset into distinct clusters. A portion of data from each cluster is then sampled to train separate local GP models. An epsilon-greedy algorithm is employed to determine which GP to train next. The researchers applied this approach to a COF dataset containing over 70,000 2D and 3D structures, assembled in silico from 666 organic linkers and four synthetic routes. The objective was to maximize methane storage performance, measured as deliverable capacity. The results demonstrated that their divide-and-conquer approach with local GP surrogates significantly outperformed a single GP model. While the single GP model reached a maximum deliverable capacity of 2.74 in 60 iterations, the proposed method surpassed this maximum within 5-10 iterations. 
Both methods retrained the GP model after 20 iterations and used 5% of the data for initial surrogate model training. This research highlights the potential of using local GP surrogates in combination with unsupervised clustering to perform more efficient Bayesian optimization in high-dimensional, large-scale datasets.", + "summary": "This research project focuses on improving Bayesian Optimization (BO) for high-dimensional, large-scale datasets, specifically applied to the design of Covalent Organic Frameworks (COFs) for methane storage\\cite{deshwal_bayesian_2021}. The researchers developed a novel approach combining unsupervised clustering with local Gaussian Process (GP) models to enhance BO efficiency in the high data regime. The method begins by using K-means clustering to partition the dataset into distinct clusters. A portion of data from each cluster is then sampled to train separate local GP models. An epsilon-greedy algorithm is employed to determine which GP to train next. The researchers applied this approach to a COF dataset containing over 70,000 2D and 3D structures, assembled in silico from 666 organic linkers and four synthetic routes. The objective was to maximize methane storage performance, measured as deliverable capacity. The results demonstrated that their divide-and-conquer approach with local GP surrogates significantly outperformed a single GP model. While the single GP model reached a maximum deliverable capacity of 2.74 in 60 iterations, the proposed method surpassed this maximum within 5-10 iterations. Both methods retrained the GP model after 20 iterations and used 5% of the data for initial surrogate model training. 
This research highlights the potential of using local GP surrogates in combination with unsupervised clustering to perform more efficient Bayesian optimization in high-dimensional, large-scale datasets.", "status": "success" } \ No newline at end of file diff --git a/_projects/json_summaries/project_44.json b/_projects/json_summaries/project_44.json index 8baf27e..c241a98 100644 --- a/_projects/json_summaries/project_44.json +++ b/_projects/json_summaries/project_44.json @@ -2,6 +2,6 @@ "project_number": 44, "project_name": "Rank-based Bayesian Optimization", "video_url": "https://www.youtube.com/watch?v=c84Sd2IwMAQ&ab_channel=GaryTom", - "summary": "This research project focused on using ranking models as surrogates in Bayesian optimization for materials discovery, specifically comparing ranking-based and conventional mean squared error (MSE) loss approaches. The key motivation was that in experimental campaigns, finding the best molecule is more important than accurately predicting absolute property values.The study employed a pairwise ranking loss (margin ranking loss) and a simple fully-connected multi-layer perceptron with three hidden layers and 100 nodes as the model architecture. Experiments were conducted on multiple datasets, including a solubility dataset (Delaney) and two datasets from Ali et al. with varying roughness. Results showed that the ranking loss model consistently outperformed the MSE-based model and random baseline in Bayesian optimization, acquiring more top-performing candidates within fewer evaluations. Interestingly, all models performed better on the smoother dataset, contrary to expectations. The study also found that model performance did not always correlate with its effectiveness as a surrogate in Bayesian optimization, as evidenced by the MSE model performing worse than the random baseline on the rougher dataset. 
The research highlights the potential of ranking-based models in overcoming overfitting issues common in Bayesian optimization with limited data points, particularly in materials discovery applications.", + "summary": "This research project focused on using ranking models as surrogates in Bayesian optimization for materials discovery, specifically comparing ranking-based and conventional mean squared error (MSE) loss approaches. The key motivation was that in experimental campaigns, finding the best molecule is more important than accurately predicting absolute property values, inspired by work on molecular pool-based active learning\\cite{graff_accelerating_2021}. The study employed a pairwise ranking loss (margin ranking loss) and a simple fully-connected multi-layer perceptron with three hidden layers and 100 nodes as the model architecture. Experiments were conducted on multiple datasets, including a solubility dataset (Delaney) and two datasets from Aldeghi et al. with varying roughness\\cite{aldeghi_roughness_2022}. Results showed that the ranking loss model consistently outperformed the MSE-based model and random baseline in Bayesian optimization, acquiring more top-performing candidates within fewer evaluations. Interestingly, all models performed better on the smoother dataset, contrary to expectations. The study also found that model performance did not always correlate with its effectiveness as a surrogate in Bayesian optimization, as evidenced by the MSE model performing worse than the random baseline on the rougher dataset. 
The research highlights the potential of ranking-based models in overcoming overfitting issues common in Bayesian optimization with limited data points, particularly in materials discovery applications.", "status": "success" } \ No newline at end of file diff --git a/copilot-main-diff.pdf b/copilot-main-diff.pdf new file mode 100644 index 0000000..5e1ce9d Binary files /dev/null and b/copilot-main-diff.pdf differ diff --git a/copilot-main-fixed.pdf b/copilot-main-fixed.pdf new file mode 100644 index 0000000..2ff98c9 Binary files /dev/null and b/copilot-main-fixed.pdf differ diff --git a/latex/authors-hardcoded.tex b/latex/authors-hardcoded.tex index 51dcd37..c9ffb8f 100644 --- a/latex/authors-hardcoded.tex +++ b/latex/authors-hardcoded.tex @@ -240,7 +240,7 @@ \author{Michail~Mitsakis} % \email{mitsakismichail@gmail.com} -\affiliation{Technical University of Denmark (DTU), Argous 64 & Potamou 77, Kifissia, Athens, Greece.} +\affiliation{Technical University of Denmark (DTU), Argous 64 \& Potamou 77, Kifissia, Athens, Greece.} \author{Cameron~Movassaghi} % \email{csmova@g.ucla.edu} @@ -347,7 +347,7 @@ \author{Arifin~San} % \email{fin_ari@jsr.co.jp} -\affiliation{JSR Corporation, JSR Bioscience and informatics R&D center (JSR BiRD), 3-103-9 Tonomachi, Kawasaki-ku, Kawasaki, Kanagawa 210-0821, Japan.} +\affiliation{JSR Corporation, JSR Bioscience and informatics R\&D center (JSR BiRD), 3-103-9 Tonomachi, Kawasaki-ku, Kawasaki, Kanagawa 210-0821, Japan.} \author{Christina~Schenk} % \email{christina.schenk@imdea.org} diff --git a/latex/references.bib b/latex/references.bib index 5f74b0d..cdd63eb 100644 --- a/latex/references.bib +++ b/latex/references.bib @@ -131,3 +131,580 @@ @article{wallum_instrument_2023 note = {Publisher: American Chemical Society}, pages = {1866--1876}, } +% Source: project-24-ScattBO.md +@article{szymanski_autonomous_2023, + title = {An autonomous laboratory for the accelerated synthesis of novel materials}, + volume = {624}, + issn = {1476-4687}, 
+ doi = {10.1038/s41586-023-06739-5}, + journal = {Nature}, + author = {Szymanski, Nathan J. and Zeng, Yan and Huo, Haotian and Bartel, Christopher J. and Kim, Hyungkyu and Parija, Abhishek and Shoghi, Nikita and Rigas, Savvas and Neaton, Jeffrey B. and Ceder, Gerbrand and Persson, Kristin A. and Toma, Francesca M. and Gregoire, John M.}, + year = {2023}, + pages = {86--91}, +} + +% Source: project-24-ScattBO.md +@article{johansen_gpu-accelerated_2024, + title = {A {GPU}-{Accelerated} {Open}-{Source} {Python} {Package} for {Calculating} {Powder} {Diffraction}, {Small}-{Angle}-, and {Total} {Scattering} with the {Debye} {Scattering} {Equation}}, + volume = {9}, + issn = {2475-9066}, + doi = {10.21105/joss.06024}, + journal = {Journal of Open Source Software}, + author = {Johansen, Frederik L. and Anker, Andy S. and Christiansen, Thomas L. and Voss, Logan S. and Juhás, Pavol and Billinge, Simon J.L. and Jensen, Kirsten M.Ø.}, + year = {2024}, + pages = {6024}, +} + +% Source: project-24-ScattBO.md +@article{leeman_challenges_2024, + title = {Challenges in {High}-{Throughput} {Inorganic} {Materials} {Prediction} and {Autonomous} {Synthesis}}, + volume = {3}, + issn = {2768-5608}, + doi = {10.1103/PRXEnergy.3.011002}, + journal = {PRX Energy}, + author = {Leeman, Josh and Kunitsa, Alexander A. and Vecchio, Kevin S. and Ceder, Gerbrand and Kusne, A. 
Gilad}, + year = {2024}, + pages = {011002}, +} + +% Source: project-07-surface-science-syndicate.md +@article{wurger_exploring_2021, + title = {Exploring structure-property relationships in magnesium dissolution modulators}, + volume = {5}, + issn = {2397-2106}, + doi = {10.1038/s41529-020-00148-z}, + journal = {npj Materials Degradation}, + author = {Würger, Tim and Mei, Di and Vaghefinazari, Bahram and Feiler, Christian and Rohwerder, Michael and Zheludkevich, Mikhail L.}, + year = {2021}, + pages = {2}, +} + +@article{ozkan_laying_2024, + title = {Laying the experimental foundation for corrosion inhibitor discovery through machine learning}, + volume = {8}, + issn = {2397-2106}, + doi = {10.1038/s41529-024-00435-z}, + journal = {npj Materials Degradation}, + author = {Özkan, Can and Sahlmann, Lennart and Feiler, Christian and Zheludkevich, Mikhail L. and Würger, Tim}, + year = {2024}, + pages = {21}, +} + +@article{azam_pretraining_2024, + title = {Pretraining {Probabilistic} {Models} for {Scalable} {Precision} {Agriculture}}, + url = {https://openreview.net/pdf?id=tPPm5OKdFy}, + journal = {ICLR 2024 Workshop on Data-centric Machine Learning Research}, + author = {Azam, Ruhana and Truong, Sang T. and Fernandes, Samuel B. and Leakey, Andrew D.B. and Lipka, Alexander and El-Kebir, Mohammed and Koyejo, Sanmi}, + year = {2024}, +} + +@article{graff_accelerating_2021, + title = {Accelerating high-throughput virtual screening through molecular pool-based active learning}, + volume = {12}, + issn = {2041-6539}, + doi = {10.1039/d1sc02482a}, + number = {22}, + journal = {Chemical Science}, + author = {Graff, David E. and Shakhnovich, Eugene I. 
and Coley, Connor W.}, + year = {2021}, + pages = {7866--7881}, +} + +@article{aldeghi_roughness_2022, + title = {Roughness of {Molecular} {Property} {Landscapes} and {Its} {Impact} on {Modellability}}, + volume = {62}, + issn = {1549-9596}, + doi = {10.1021/acs.jcim.2c00792}, + number = {19}, + journal = {Journal of Chemical Information and Modeling}, + author = {Aldeghi, Matteo and Häse, Florian and Hickman, Riley J. and Tamblyn, Isaac and Aspuru-Guzik, Alán}, + year = {2022}, + pages = {4660--4671}, +} + +@article{ramakrishnan_quantum_2014, + title = {Quantum chemistry structures and properties of 134 kilo molecules}, + volume = {1}, + issn = {2052-4463}, + doi = {10.1038/sdata.2014.22}, + journal = {Scientific Data}, + author = {Ramakrishnan, Raghunathan and Dral, Pavlo O. and Rupp, Matthias and von Lilienfeld, O. Anatole}, + year = {2014}, + pages = {140022}, +} + +@article{ansari_learning_2023, + title = {Learning {Peptide} {Properties} with {Positive} {Examples} {Only}}, + url = {https://doi.org/10.1101/2023.06.01.543289}, + journal = {bioRxiv}, + author = {Ansari, Mehrad and White, Andrew D.}, + year = {2023}, +} + +@article{frazier_tutorial_2018, + title = {A {Tutorial} on {Bayesian} {Optimization}}, + url = {http://arxiv.org/abs/1807.02811}, + journal = {arXiv}, + author = {Frazier, Peter I.}, + year = {2018}, +} + +@article{shahriari_taking_2016, + title = {Taking the {Human} out of the {Loop}: {A} {Review} of {Bayesian} {Optimization}}, + volume = {104}, + issn = {0018-9219}, + shorttitle = {Taking the {Human} out of the {Loop}}, + doi = {10.1109/JPROC.2015.2494218}, + number = {1}, + journal = {Proceedings of the IEEE}, + author = {Shahriari, Bobak and Swersky, Kevin and Wang, Ziyu and Adams, Ryan P. 
and de Freitas, Nando}, + year = {2016}, + pages = {148--175}, +} + +@article{wang_bayesian_2022, + title = {Bayesian {Optimization} for {Chemical} {Products} and {Functional} {Materials}}, + volume = {36}, + issn = {2211-3398}, + doi = {10.1016/j.coche.2021.100728}, + journal = {Current Opinion in Chemical Engineering}, + author = {Wang, Ke and Dowling, Alexander W.}, + year = {2022}, + pages = {100728}, +} + +@article{lewis_retrieval-augmented_2020, + title = {Retrieval-augmented generation for knowledge-intensive nlp tasks}, + volume = {33}, + journal = {Advances in Neural Information Processing Systems}, + author = {Lewis, Patrick and Perez, Ethan and Piktus, Aleksandra and Petroni, Fabio and Karpukhin, Vladimir and Goyal, Naman and Küttler, Heinrich and Lewis, Mike and Yih, Wen-tau and Rocktäschel, Tim and Riedel, Sebastian and Kiela, Douwe}, + year = {2020}, + pages = {9459--9474}, +} + +@misc{fitzner_baybe_2022, + author = {Martin Fitzner and Adrian {\v{S}}o{\v{s}}i{\'{c}} and Alexander Hopp and Alex Lee}, + title = {{BayBE} -- a Bayesian back end for design of experiments}, + year = {2022}, + howpublished = {\url{https://github.com/emdgroup/baybe}}, + note = {Accessed: 2024-05-28} +} + +@article{altamirano_robust_2023, + title = {Robust and conjugate {Gaussian} process regression}, + url = {https://arxiv.org/abs/2311.00463}, + journal = {arXiv preprint arXiv:2311.00463}, + author = {Altamirano, Matias and Briol, François-Xavier and Knoblauch, Jeremias}, + year = {2023}, +} + +@inproceedings{ament_sustainable_2023, + title = {Sustainable concrete via bayesian optimization}, + url = {https://arxiv.org/abs/2310.18288}, + booktitle = {NeurIPS 2023 Workshop on Adaptive Experimentation in the Real World}, + author = {Ament, Sebastian and Witte, Andrew and Garg, Nishant and Kusuma, Julius}, + year = {2023}, +} + +@article{lilienfeld_retrospective_2020, + title = {Retrospective on a decade of machine learning for chemical discovery}, + volume = {11}, + issn = 
{2041-1723}, + doi = {10.1038/s41467-020-18556-9}, + number = {1}, + journal = {Nature Communications}, + author = {von Lilienfeld, Anatole and Burke, Kieron}, + year = {2020}, +} + +@inproceedings{lin_preference_2022, + title = {Preference {Exploration} for {Efficient} {Bayesian} {Optimization} with {Multiple} {Outcomes}}, + url = {https://proceedings.mlr.press/v151/jerry-lin22a}, + booktitle = {Proceedings of The 25th International Conference on Artificial Intelligence and Statistics}, + publisher = {PMLR}, + author = {Lin, Z Jerry and Astudillo, Raul and Frazier, Peter and Bakshy, Eytan}, + year = {2022}, + pages = {4235--4258}, +} + +@article{lozano-blanco_single-event_2008, + title = {Single-{Event} {Microkinetic} {Model} for {Fischer}−{Tropsch} {Synthesis} on {Iron}-{Based} {Catalysts}}, + volume = {47}, + issn = {0888-5885}, + doi = {10.1021/ie071587u}, + number = {16}, + journal = {Industrial \& Engineering Chemistry Research}, + author = {Lozano-Blanco, Gisela and Thybaut, Joris W. and Surla, Karine and Galtier, Pierre and Marin, Guy B.}, + year = {2008}, + pages = {5879--5891}, +} + +@article{chakkingal_multi-output_2022, + title = {Multi-output machine learning models for kinetic data evaluation : {A} {Fischer}–{Tropsch} synthesis case study}, + volume = {446}, + issn = {1385-8947}, + shorttitle = {Multi-output machine learning models for kinetic data evaluation}, + doi = {10.1016/j.cej.2022.137186}, + journal = {Chemical Engineering Journal}, + author = {Chakkingal, Anoop and Janssens, Pieter and Poissonnier, Jeroen and Virginie, Mirella and Khodakov, Andrei Y. 
and Thybaut, Joris W.}, + year = {2022}, + pages = {137186}, +} + +@article{qin_large_2023, + title = {Large {Language} {Models} are {Effective} {Text} {Rankers} with {Pairwise} {Ranking} {Prompting}}, + url = {https://arxiv.org/abs/2306.17563}, + doi = {10.48550/arXiv.2306.17563}, + journal = {arXiv preprint arXiv:2306.17563}, + author = {Qin, Zhen and Jagerman, Rolf and Hui, Kai and Zhuang, Honglei and Wu, Junru and Zhou, Jianing and Chen, Tao and Croft, W. Bruce and Wang, Dawei}, + year = {2023}, +} + +@article{deshwal_bayesian_2021, + title = {Bayesian {Optimization} of {Nanoporous} {Materials}}, + volume = {6}, + issn = {2058-9689}, + doi = {10.1039/D1ME00093D}, + number = {12}, + journal = {Molecular Systems Design \& Engineering}, + author = {Deshwal, Aryan and Simon, Cory M. and Doppa, Janardhan Rao}, + year = {2021}, + pages = {1066--1086}, +} + +@article{bickerton_quantifying_2012, + title = {Quantifying the {Chemical} {Beauty} of {Drugs}}, + volume = {4}, + issn = {1755-4330}, + doi = {10.1038/nchem.1243}, + number = {2}, + journal = {Nature Chemistry}, + author = {Bickerton, G. Richard and Paolini, Gaia V. and Besnard, Jérémy and Muresan, Sorel and Hopkins, Andrew L.}, + year = {2012}, + pages = {90--98}, +} + +@article{garcia-ortegon_dockstring_2022, + title = {{DOCKSTRING}: {Easy} {Molecular} {Docking} {Yields} {Better} {Benchmarks} for {Ligand} {Design}}, + volume = {62}, + issn = {1549-9596}, + shorttitle = {{DOCKSTRING}}, + doi = {10.1021/acs.jcim.1c01334}, + number = {15}, + journal = {Journal of Chemical Information and Modeling}, + author = {García-Ortegón, Melvin and Simm, Gregor N. C. and Tripp, Andrew J. 
and Hernández-Lobato, José Miguel and Bender, Andreas and Bacallado, Sergio}, + year = {2022}, + pages = {3486--3502}, +} + +@article{angello_convergent_2022, + title = {Convergent artificial molecular integration for accelerated discovery}, + volume = {378}, + issn = {0036-8075, 1095-9203}, + doi = {10.1126/science.adc8743}, + number = {6618}, + journal = {Science}, + author = {Angello, Nicholas H. and Chu, Susan H. and Mirica, Katherine A. and Lewis, Nathan S.}, + year = {2022}, + pages = {399--404}, +} + +@article{wang_autonomous_2024, + title = {Autonomous materials research systems for accelerated electrochemical synthesis and discovery}, + volume = {626}, + issn = {1476-4687}, + doi = {10.1038/s41586-024-07021-y}, + number = {8000}, + journal = {Nature}, + author = {Wang, Jiale Y. and Li, Jiaxun and Wang, Yan and Johnston, Kate E. and Gani, Terry Z. H. and Bilodeau, Robert D. and Tayvah, Uriel and Andersen, Thomas I. and Berlinger, Sebastian A. and Chubak, Iryna and de Poel, Wester and Murray, Ethel and Ovchinnikov, Mikhail and Zhang, Fengyu and Yu, Nora M. and Sargent, Edward H. 
and Aspuru-Guzik, Alán}, + year = {2024}, + pages = {1025--1031}, +} + +@article{reker_active-learning_2015, + title = {Active-learning strategies in computer-assisted drug discovery}, + volume = {20}, + issn = {1359-6446}, + doi = {10.1016/j.drudis.2014.12.004}, + number = {4}, + journal = {Drug Discovery Today}, + author = {Reker, Daniel and Schneider, Gisbert}, + year = {2015}, + pages = {458--465}, +} + +@article{xu_high-throughput_2020, + title = {High-{Throughput} {Synthesis}, {Analysis}, and {Optimization} of {Injectable} {Hydrogels} for {Protein} {Delivery}}, + volume = {21}, + issn = {1525-7797}, + doi = {10.1021/acs.biomac.9b01132}, + number = {3}, + journal = {Biomacromolecules}, + author = {Xu, Jennifer and Feng, Qingyu and Lin, Shengfu and Yuan, Wei and Li, Ruijie and Li, Juan and Wei, Kai and Chen, Xiaoxuan and Yu, Kun and Liu, Jie and Zhang, Kun and Guo, Zhuang and Wang, Minghao and Li, Fubing and Li, Jiayu and Bian, Liming}, + year = {2020}, + pages = {1128--1133}, +} + +@article{krajina_dynamic_2017, + title = {Dynamic {Light} {Scattering} {Microrheology} {Reveals} {Multiscale} {Viscoelasticity} of {Polymer} {Gels} and {Precious} {Biological} {Materials}}, + volume = {3}, + issn = {2374-7943}, + doi = {10.1021/acscentsci.7b00449}, + number = {12}, + journal = {ACS Central Science}, + author = {Krajina, Brad A. and Tropini, Carolina and Zhu, Alvin and DiGiacomo, Paul and Sonnenburg, Justin L. and Heilshorn, Sarah C. and Spakowitz, Andrew J.}, + year = {2017}, + pages = {1294--1303}, +} + +@article{sanchez-lengeling_bayesian_2018, + title = {A {Bayesian} {Approach} to {Predict} {Solubility} {Parameters}}, + volume = {2}, + issn = {2513-0390}, + doi = {10.1002/adts.201800069}, + number = {1}, + journal = {Advanced Theory and Simulations}, + author = {Sanchez-Lengeling, Benjamin and Roch, Louis M. 
and Perea, José García and Langner, Stefan and Brabec, Christoph and Aspuru-Guzik, Alán}, + year = {2018}, + pages = {1800069}, +} + +@article{rapp_quantum_2024, + title = {Quantum {Gaussian} process regression for {Bayesian} optimization}, + volume = {6}, + issn = {2524-4914}, + doi = {10.1007/s42484-023-00138-9}, + journal = {Quantum Machine Intelligence}, + author = {Rapp, Felix and Roth, Martin}, + year = {2024}, + pages = {5}, +} + +@article{bellamy_batched_2022, + title = {Batched {Bayesian} {Optimization} for {Drug} {Design} in {Noisy} {Environments}}, + volume = {62}, + issn = {1549-9596}, + doi = {10.1021/acs.jcim.2c00602}, + number = {17}, + journal = {Journal of Chemical Information and Modeling}, + author = {Bellamy, Hugo and Rehim, Abbi Abdel and Orhobor, Oghenejokpeme I. and King, Ross}, + year = {2022}, + pages = {3970--3981}, +} + +@article{rube_prediction_2022, + title = {Prediction of protein–ligand binding affinity from sequencing data with interpretable machine learning}, + volume = {40}, + issn = {1546-1696}, + doi = {10.1038/s41587-022-01307-0}, + number = {10}, + journal = {Nature Biotechnology}, + author = {Rube, H. Tomas and Rastogi, Cyrus and Feng, Song and Kribelbauer, Judith F. and Petrovic, Jonathan and Bussemaker, Harmen J.}, + year = {2022}, + pages = {1520--1527}, +} + +@article{felton_summit_2021, + title = {Summit: benchmarking machine learning methods for reaction optimisation}, + volume = {1}, + issn = {2628-6645}, + shorttitle = {Summit}, + doi = {10.1002/cmtd.202000051}, + number = {2}, + journal = {Chemistry–Methods}, + author = {Felton, Kirk C. and Rittig, Jan G. and Lapkin, Alexei A.}, + year = {2021}, + pages = {116--122}, +} + +@article{hone_rapid_2017, + title = {Rapid multistep kinetic model generation from transient flow data}, + volume = {2}, + issn = {2058-9883}, + doi = {10.1039/C6RE00109B}, + number = {2}, + journal = {Reaction Chemistry \& Engineering}, + author = {Hone, Christopher A. 
and Holmes, Nicholas and Akien, Geoffrey R. and Bourne, Richard A. and Muller, Ferenc L.}, + year = {2017}, + pages = {103--108}, +} + +@article{sorourifar_accelerating_2024, + title = {Accelerating {Black}-{Box} {Molecular} {Property} {Optimization} by {Adaptively} {Learning} {Sparse} {Subspaces}}, + url = {http://arxiv.org/abs/2401.01398}, + journal = {arXiv preprint arXiv:2401.01398}, + author = {Sorourifar, Fani and Banker, Tushar and Paulson, Joel A.}, + year = {2024}, +} + +@article{maus_local_2023, + title = {Local {Latent} {Space} {Bayesian} {Optimization} over {Structured} {Inputs}}, + url = {http://arxiv.org/abs/2201.11872}, + journal = {arXiv preprint arXiv:2201.11872}, + author = {Maus, Nicolas and Jones, Henry T. and Moore, Jeffrey S. and Kusner, Matt J. and Bradshaw, John and Gardner, Jacob R.}, + year = {2023}, +} + +@article{dusselier_small-pore_2018, + title = {Small-{Pore} {Zeolites}: {Synthesis} and {Catalysis}}, + volume = {118}, + issn = {0009-2665}, + shorttitle = {Small-{Pore} {Zeolites}}, + doi = {10.1021/acs.chemrev.7b00738}, + number = {11}, + journal = {Chemical Reviews}, + author = {Dusselier, Michiel and Davis, Mark E.}, + year = {2018}, + pages = {5265--5329}, +} + +@article{mallette_current_2024, + title = {The {Current} {Understanding} of {Mechanistic} {Pathways} in {Zeolite} {Crystallization}}, + issn = {0009-2665}, + doi = {10.1021/acs.chemrev.3c00801}, + journal = {Chemical Reviews}, + author = {Mallette, Alexander J. 
and Shilpa, Kotni and Rimer, Jeffrey D.}, + year = {2024}, +} + +@misc{mqs_cosmo_2024, + title = {COSMO-SAC: A Powerful Tool for Solvent Selection in Pharmaceutical Drug Development}, + howpublished = {\url{https://blog.mqs.dk/posts/10_cosmo/10_cosmo/}}, + author = {MQS}, + year = {2024}, + note = {Accessed: 2024-05-28}, +} + +% Source: project-21-The_OSU_ChemEs.md +@article{sorourifar_accelerating_2024, + title = {Accelerating Black-Box Molecular Property Optimization by Adaptively Learning Sparse Subspaces}, + journal = {arXiv [q-Bio.BM]}, + author = {Sorourifar, F. and Banker, T. and Paulson, J. A.}, + year = {2024}, + howpublished = {\url{http://arxiv.org/abs/2401.01398}}, +} + +% Source: project-21-The_OSU_ChemEs.md +@article{maus_local_2023, + title = {Local Latent Space Bayesian Optimization over Structured Inputs}, + journal = {arXiv [Cs.LG]}, + author = {Maus, N. and Jones, H. T. and Moore, J. S. and Kusner, M. J. and Bradshaw, J. and Gardner, J. R.}, + year = {2023}, + howpublished = {\url{http://arxiv.org/abs/2201.11872}}, +} + +% Source: project-22-chemical-similarity-EMD-kernel-BO.md +@article{moss_gaussian_2020, + title = {Gaussian process molecule property prediction with flowmo}, + journal = {arXiv preprint arXiv:2010.01118}, + author = {Moss, Henry B. and Griffiths, Ryan-Rhys}, + year = {2020}, +} + +% Source: project-22-chemical-similarity-EMD-kernel-BO.md +@article{hargreaves_earth_2020, + title = {The earth mover's distance as a metric for the space of inorganic compositions}, + volume = {32}, + number = {24}, + journal = {Chemistry of Materials}, + author = {Hargreaves, Cameron J. and Dyer, Matthew S. and Gaultois, Michael W. and Kurlin, Vitaliy A. and Rosseinsky, Matthew J.}, + year = {2020}, + pages = {10610--10620}, +} + +% Source: project-25-mcgill-denovo.md +@article{bickerton_quantifying_2012, + title = {Quantifying the Chemical Beauty of Drugs}, + volume = {4}, + number = {2}, + journal = {Nature chemistry}, + author = {Bickerton, G. R. 
and Paolini, G. V. and Besnard, J. and Muresan, S. and Hopkins, A. L.}, + year = {2012}, + pages = {90--98}, + doi = {10.1038/nchem.1243}, +} + +% Source: project-25-mcgill-denovo.md +@article{garcia_dockstring_2022, + title = {DOCKSTRING: Easy Molecular Docking Yields Better Benchmarks for Ligand Design}, + volume = {62}, + number = {15}, + journal = {Journal of Chemical Information and Modeling}, + author = {García-Ortegón, M. and Simm, G. N. C. and Tripp, A. J. and Hernández-Lobato, J. M. and Bender, A. and Bacallado, S.}, + year = {2022}, + pages = {3486--3502}, + doi = {10.1021/acs.jcim.1c01334}, +} + +% Source: project-27-bayes-warmup.md +@article{ruddigkeit_enumeration_2012, + title = {Enumeration of 166 billion organic small molecules in the chemical universe database GDB-17}, + volume = {52}, + journal = {J. Chem. Inf. Model.}, + author = {Ruddigkeit, L. and van Deursen, R. and Blum, L. C. and Reymond, J.-L.}, + year = {2012}, + pages = {2864--2875}, +} + +% Source: project-27-bayes-warmup.md +@article{ramakrishnan_quantum_2014, + title = {Quantum chemistry structures and properties of 134 kilo molecules}, + volume = {1}, + journal = {Scientific Data}, + author = {Ramakrishnan, R. and Dral, P. O. and Rupp, M. and von Lilienfeld, O. 
A.}, + year = {2014}, + pages = {140022}, +} + +% Source: project-35-gauche.md +@inproceedings{griffiths_gauche_2024, + title = {Gauche: A library for Gaussian processes in chemistry}, + volume = {36}, + booktitle = {Advances in Neural Information Processing Systems}, + author = {Griffiths, Ryan-Rhys and Klarner, Leo and Moss, Henry and Ravuri, Aditya and Truong, Sang and Du, Yuanqi and Stanton, Samuel and others}, + year = {2024}, +} + +% Source: project-36-nonmyopic.md +@inproceedings{jiang_efficient_2020, + title = {Efficient nonmyopic Bayesian optimization via one-shot multi-step trees}, + volume = {33}, + booktitle = {Advances in Neural Information Processing Systems}, + author = {Jiang, Shali and Jiang, Daniel and Balandat, Maximilian and Karrer, Brian and Gardner, Jacob and Garnett, Roman}, + year = {2020}, + pages = {18039--18049}, +} + +% Source: project-39-localGPS_for_COF.md +@article{deshwal_bayesian_2021, + title = {Bayesian Optimization of Nanoporous Materials}, + volume = {6}, + number = {12}, + journal = {Mol. Syst. Des. Eng.}, + author = {Deshwal, A. and Simon, C. M. and Doppa, J. R.}, + year = {2021}, + pages = {1066--1086}, + doi = {10.1039/D1ME00093D}, +} + +% Source: project-44-rank-bo.md +@article{graff_accelerating_2021, + title = {Accelerating high-throughput virtual screening through molecular pool-based active learning}, + volume = {12}, + number = {22}, + journal = {Chemical science}, + author = {Graff, David E. and Shakhnovich, Eugene I. 
and Coley, Connor W.}, + year = {2021}, + pages = {7866--7881}, +} + +% Source: project-44-rank-bo.md +@article{aldeghi_roughness_2022, + title = {Roughness of molecular property landscapes and its impact on modellability}, + volume = {62}, + number = {19}, + journal = {Journal of Chemical Information and Modeling}, + author = {Aldeghi, Matteo and others}, + year = {2022}, + pages = {4660--4671}, +} \ No newline at end of file diff --git a/main_final_diffNotes.bib b/main_final_diffNotes.bib new file mode 100644 index 0000000..8f3dc15 --- /dev/null +++ b/main_final_diffNotes.bib @@ -0,0 +1,2 @@ +@CONTROL{REVTEX42Control} +@CONTROL{apsrev42Control,author="08",editor="1",pages="0",title="0",year="1"} diff --git a/python_scripts/process_summaries.py b/python_scripts/process_summaries.py index 4af8462..6ca2519 100644 --- a/python_scripts/process_summaries.py +++ b/python_scripts/process_summaries.py @@ -2,14 +2,29 @@ import os -def clean_text(text): - return ( - text.replace("&", "\\&") - .replace("%", "\\%") - .replace("#", "\\#") - .replace("\_", "\\_") - .replace("^", "\\textasciicircum{}") - ) # .replace('{', '\\{').replace('}', '\\}').replace('~', '\\textasciitilde{}') +def clean_text(text): + import re + # First escape other characters + text = ( + text.replace("&", "\\&") + .replace("%", "\\%") + .replace("#", "\\#") + .replace("^", "\\textasciicircum{}") + ) + + # Handle underscores carefully - don't escape them inside \cite{} commands + # Use regex to find \cite{...} blocks and protect them + def protect_citations(match): + return match.group(0) # Return citation as-is + + # Split text by citation blocks and process each part + parts = re.split(r'(\\cite\{[^}]*\})', text) + for i in range(len(parts)): + if not parts[i].startswith('\\cite{'): + # This is regular text, escape underscores + parts[i] = parts[i].replace("_", "\\_") + + return ''.join(parts) # def gen_summary(input_dir): @@ -35,88 +50,66 @@ def clean_text(text): # print(" ".join(sections)) -def 
gen_summary(input_dir): - sections = [] - entries = [] - - for filename in os.listdir(input_dir): - if filename.endswith(".json"): - with open(os.path.join(input_dir, filename), "r", encoding="utf-8") as file: - data = json.load(file) - project_number = clean_text( - str(data.get("project_number", "Unknown Project")) - ) - project_name = clean_text(data.get("project_name", "Unknown Project")) - summary = clean_text(data.get("summary", "No summary available.")) - video_url = data.get("video_url", "") - - entries.append((project_number, project_name, summary, video_url)) - - try: - entries.sort(key=lambda x: int(x[0])) - except ValueError: - entries.sort(key=lambda x: x[0]) - - for project_number, project_name, summary, video_url in entries: - title = f"Project {project_number}: {project_name}" - if video_url: - section_title = f"\\subsection*{{\\href{{{video_url}}}{{{title}}}}}" - else: - section_title = f"\\subsection*{{{title}}}" - - section = f"{section_title}\n\n{summary}\n" - sections.append(section) - - print(" ".join(sections)) - - # Check if directory exists - if not os.path.exists(input_dir): - return ( - "% JSON directory not found at: " - + input_dir - + "\n" - + "\\subsection*{Project Summaries Not Available}\n\n" - + "The project summaries will be available in the final version of the document. " - + "Please ensure the JSON directory exists and contains the necessary files." 
- ) - - try: - for filename in os.listdir(input_dir): - if filename.endswith(".json"): - try: - with open( - os.path.join(input_dir, filename), "r", encoding="utf-8" - ) as file: - data = json.load(file) - project_name = clean_text( - data.get("project_name", "Unknown Project") - ) - summary = clean_text( - data.get("summary", "No summary available.") - ) - video_url = data.get("video_url", "") - - if video_url: - section_title = f"\\subsection*{{\\href{{{video_url}}}{{{project_name}}}}}" - else: - section_title = f"\\subsection*{{{project_name}}}" - - section = f"{section_title}\n\n{summary}\n" - sections.append(section) - except Exception as e: - sections.append( - f"\\subsection*{{Error Processing {filename}}}\n\nError: {e}\n" - ) - except Exception as e: - return ( - f"% Error accessing JSON directory: {str(e)}\n" - + "\\subsection*{Error Processing Summaries}\n\n" - + f"An error occurred while processing the project summaries: {str(e)}" - ) - - if not sections: - return "\\subsection*{No Project Summaries Found}\n\nNo project summary files were found in the specified directory." - +def gen_summary(input_dir): + # Check if directory exists + if not os.path.exists(input_dir): + return ( + "% JSON directory not found at: " + + input_dir + + "\n" + + "\\subsection*{Project Summaries Not Available}\n\n" + + "The project summaries will be available in the final version of the document. " + + "Please ensure the JSON directory exists and contains the necessary files." 
+ ) + + sections = [] + entries = [] + + try: + for filename in os.listdir(input_dir): + if filename.endswith(".json"): + try: + with open(os.path.join(input_dir, filename), "r", encoding="utf-8") as file: + data = json.load(file) + project_number = clean_text( + str(data.get("project_number", "Unknown Project")) + ) + project_name = clean_text(data.get("project_name", "Unknown Project")) + summary = clean_text(data.get("summary", "No summary available.")) + video_url = data.get("video_url", "") + + entries.append((project_number, project_name, summary, video_url)) + except Exception as e: + sections.append( + f"\\subsection*{{Error Processing {filename}}}\n\nError: {e}\n" + ) + except Exception as e: + return ( + f"% Error accessing JSON directory: {str(e)}\n" + + "\\subsection*{Error Processing Summaries}\n\n" + + f"An error occurred while processing the project summaries: {str(e)}" + ) + + # Sort entries by project number + try: + entries.sort(key=lambda x: int(x[0])) + except ValueError: + entries.sort(key=lambda x: x[0]) + + # Generate sections + for project_number, project_name, summary, video_url in entries: + title = f"Project {project_number}: {project_name}" + if video_url: + section_title = f"\\subsection*{{\\href{{{video_url}}}{{{title}}}}}" + else: + section_title = f"\\subsection*{{{title}}}" + + section = f"{section_title}\n\n{summary}\n" + sections.append(section) + + if not sections: + return "\\subsection*{No Project Summaries Found}\n\nNo project summary files were found in the specified directory." + return " ".join(sections)