Clinical research relies on high-quality patient data, however,
obtaining big data sets is costly and access to existing data is
often hindered by privacy and regulatory concerns. Synthetic data
generation holds the promise of effectively bypassing these
boundaries allowing for simplified data accessibility and the
prospect of synthetic control cohorts. We employed two different
methodologies of generative artificial intelligence - CTAB-GAN+
and normalizing flows (NFlow) - to synthesize patient data
derived from 1606 patients with acute myeloid leukemia, a
heterogeneous hematological malignancy, that were treated within
four multicenter clinical trials. Both generative models
accurately captured distributions of demographic, laboratory,
molecular and cytogenetic variables, as well as patient outcomes
yielding high performance scores regarding fidelity and usability
of both synthetic cohorts (n = 1606 each). Survival analysis
demonstrated close resemblance of survival curves between
original and synthetic cohorts. Inter-variable relationships were
preserved in univariable outcome analysis enabling explorative
analysis in our synthetic data. Additionally, training sample
privacy is safeguarded mitigating possible patient
re-identification, which we quantified using Hamming distances.
We provide not only a proof-of-concept for synthetic data
generation in multimodal clinical data for rare diseases, but
also full public access to synthetic data sets to foster further
research.
%0 Journal Article
%1 Eckardt2024-jt
%A Eckardt, Jan-Niklas
%A Hahn, Waldemar
%A Röllig, Christoph
%A Stasik, Sebastian
%A Platzbecker, Uwe
%A Müller-Tidow, Carsten
%A Serve, Hubert
%A Baldus, Claudia D
%A Schliemann, Christoph
%A Schäfer-Eckart, Kerstin
%A Hanoun, Maher
%A Kaufmann, Martin
%A Burchert, Andreas
%A Thiede, Christian
%A Schetelig, Johannes
%A Sedlmayr, Martin
%A Bornhäuser, Martin
%A Wolfien, Markus
%A Middeke, Jan Moritz
%D 2024
%J NPJ Digit. Med.
%K ai artificial clinical generative intelligence leukemia myeloid synthetic trials yaff
%N 1
%P 76
%T Mimicking clinical trials with synthetic acute myeloid leukemia patients using generative artificial intelligence
%V 7
%X Clinical research relies on high-quality patient data, however,
obtaining big data sets is costly and access to existing data is
often hindered by privacy and regulatory concerns. Synthetic data
generation holds the promise of effectively bypassing these
boundaries allowing for simplified data accessibility and the
prospect of synthetic control cohorts. We employed two different
methodologies of generative artificial intelligence - CTAB-GAN+
and normalizing flows (NFlow) - to synthesize patient data
derived from 1606 patients with acute myeloid leukemia, a
heterogeneous hematological malignancy, that were treated within
four multicenter clinical trials. Both generative models
accurately captured distributions of demographic, laboratory,
molecular and cytogenetic variables, as well as patient outcomes
yielding high performance scores regarding fidelity and usability
of both synthetic cohorts (n = 1606 each). Survival analysis
demonstrated close resemblance of survival curves between
original and synthetic cohorts. Inter-variable relationships were
preserved in univariable outcome analysis enabling explorative
analysis in our synthetic data. Additionally, training sample
privacy is safeguarded mitigating possible patient
re-identification, which we quantified using Hamming distances.
We provide not only a proof-of-concept for synthetic data
generation in multimodal clinical data for rare diseases, but
also full public access to synthetic data sets to foster further
research.
@comment{Eckardt2024-jt: journal article, volume 7, issue 1, article number 76 (Mar 2024).
Fields are grouped as: bibliographic core, descriptive fields, then the
bookkeeping fields of the bookmarking service referenced by biburl
(added-at, timestamp, biburl, interhash, intrahash), which styles ignore.
journal holds the full title; the abbreviated form is kept in shortjournal.}
@article{Eckardt2024-jt,
  author       = {Eckardt, Jan-Niklas and Hahn, Waldemar and R{\"o}llig, Christoph and Stasik, Sebastian and Platzbecker, Uwe and M{\"u}ller-Tidow, Carsten and Serve, Hubert and Baldus, Claudia D and Schliemann, Christoph and Sch{\"a}fer-Eckart, Kerstin and Hanoun, Maher and Kaufmann, Martin and Burchert, Andreas and Thiede, Christian and Schetelig, Johannes and Sedlmayr, Martin and Bornh{\"a}user, Martin and Wolfien, Markus and Middeke, Jan Moritz},
  title        = {Mimicking clinical trials with synthetic acute myeloid leukemia patients using generative artificial intelligence},
  journal      = {npj Digital Medicine},
  shortjournal = {NPJ Digit. Med.},
  volume       = {7},
  number       = {1},
  pages        = {76},
  year         = {2024},
  month        = mar,
  language     = {en},
  keywords     = {ai artificial clinical generative intelligence leukemia myeloid synthetic trials yaff},
  abstract     = {Clinical research relies on high-quality patient data, however,
obtaining big data sets is costly and access to existing data is
often hindered by privacy and regulatory concerns. Synthetic data
generation holds the promise of effectively bypassing these
boundaries allowing for simplified data accessibility and the
prospect of synthetic control cohorts. We employed two different
methodologies of generative artificial intelligence - CTAB-GAN+
and normalizing flows (NFlow) - to synthesize patient data
derived from 1606 patients with acute myeloid leukemia, a
heterogeneous hematological malignancy, that were treated within
four multicenter clinical trials. Both generative models
accurately captured distributions of demographic, laboratory,
molecular and cytogenetic variables, as well as patient outcomes
yielding high performance scores regarding fidelity and usability
of both synthetic cohorts (n = 1606 each). Survival analysis
demonstrated close resemblance of survival curves between
original and synthetic cohorts. Inter-variable relationships were
preserved in univariable outcome analysis enabling explorative
analysis in our synthetic data. Additionally, training sample
privacy is safeguarded mitigating possible patient
re-identification, which we quantified using Hamming distances.
We provide not only a proof-of-concept for synthetic data
generation in multimodal clinical data for rare diseases, but
also full public access to synthetic data sets to foster further
research.},
  added-at     = {2025-01-07T16:33:43.000+0100},
  timestamp    = {2025-07-29T10:49:33.000+0200},
  biburl       = {https://puma.scadsai.uni-leipzig.de/bibtex/2abf30651406b6b4803daa9239832f9a2/scadsfct},
  interhash    = {612c8ed14c52ea836d2a28d9254624cb},
  intrahash    = {abf30651406b6b4803daa9239832f9a2},
}