Load and Combine Data From Multiple Samples
combineSamples.Rd
Given multiple data frames stored in separate files,
loadDataFromFileList()
loads and combines them into a single data frame.
combineSamples() has the same default behavior as
loadDataFromFileList(),
but possesses additional arguments that allow the data frames to be filtered,
subsetted and augmented with sample-level variables before being combined.
Usage
loadDataFromFileList(
file_list,
input_type,
data_symbols = NULL,
header, sep, read.args
)
combineSamples(
file_list,
input_type,
data_symbols = NULL,
header, sep, read.args,
seq_col = NULL,
min_seq_length = NULL,
drop_matches = NULL,
subset_cols = NULL,
sample_ids = NULL,
subject_ids = NULL,
group_ids = NULL,
verbose = FALSE
)
Arguments
- file_list
A character vector of file paths, or a list containing
connections and file paths. Each element corresponds to a single file containing the data for a single sample.
- input_type
A character string specifying the file format of the sample data files. Options are
"rds", "rda", "csv", "csv2", "tsv", "table". See details.
- data_symbols
Used when
input_type = "rda". Specifies the name of each sample's data frame within its respective Rdata file. Accepts a character vector of the same length as file_list. Alternatively, a single character string can be used if all data frames have the same name.
- header
For values of
input_type other than "rds" and "rda", this argument can be used to specify a non-default value of the header argument to read.table(), read.csv(), etc.
- sep
For values of
input_type other than "rds" and "rda", this argument can be used to specify a non-default value of the sep argument to read.table(), read.csv(), etc.
- read.args
For values of
input_type other than "rds" and "rda", this argument can be used to specify non-default values of optional arguments to read.table(), read.csv(), etc. Accepts a named list of argument values. Values of header and sep in this list take precedence over values specified via the header and sep arguments.
- seq_col
If provided, each sample's data will be filtered based on the values of
min_seq_length and drop_matches. Passed to filterInputData() for each sample.
- min_seq_length
Passed to
filterInputData() for each sample.
- drop_matches
Passed to
filterInputData() for each sample.
- subset_cols
Passed to
filterInputData() for each sample.
- sample_ids
A character or numeric vector of sample IDs, whose length matches that of
file_list.
- subject_ids
An optional character or numeric vector of subject IDs, whose length matches that of
file_list. Used to assign a subject ID to each sample.
- group_ids
A character or numeric vector of group IDs whose length matches that of
file_list. Used to assign each sample to a group.
- verbose
Logical. If
TRUE, generates messages about the tasks performed and their progress, as well as relevant properties of intermediate outputs. Messages are sent to stderr().
Details
Each file is assumed to contain the data for a single sample, with observations indexed by row, and with the same columns across samples.
Valid options for input_type (and the corresponding function used to
load each file) include:
- "rds": readRDS()
- "rda": load()
- "csv": read.csv()
- "csv2": read.csv2()
- "tsv": read.delim()
- "table": read.table()
If input_type = "rda", the data_symbols argument specifies the
name of each data frame within its respective file.
When calling combineSamples(), for each of sample_ids,
subject_ids and group_ids that is non-null, a corresponding
variable will be added to the combined data frame; these variables are named
SampleID, SubjectID and GroupID.
References
Hai Yang, Jason Cham, Brian Neal, Zenghua Fan, Tao He and Li Zhang. (2023). NAIR: Network Analysis of Immune Repertoire. Frontiers in Immunology, vol. 14. doi: 10.3389/fimmu.2023.1181825
Author
Brian Neal (Brian.Neal@ucsf.edu)
Examples
# Generate example data
set.seed(42)
samples <- simulateToyData(sample_size = 5)
sample_1 <- subset(samples, SampleID == "Sample1")
sample_2 <- subset(samples, SampleID == "Sample2")
# RDS format
rdsfiles <- tempfile(c("sample1", "sample2"), fileext = ".rds")
saveRDS(sample_1, rdsfiles[1])
saveRDS(sample_2, rdsfiles[2])
loadDataFromFileList(
rdsfiles,
input_type = "rds"
)
#> CloneSeq CloneFrequency CloneCount SampleID
#> file1.1 TTGAGGAAATTGC 0.1064411838 428 Sample1
#> file1.2 GGAGATGAATTGG 0.2467047998 992 Sample1
#> file1.3 GTCGGGTAATTGG 0.1178811241 474 Sample1
#> file1.4 GCCGGGTAATTC 0.4794827157 1928 Sample1
#> file1.5 GAAAGAGAATCGG 0.0494901766 199 Sample1
#> file2.6 AAACACGAATTCG 0.3801916933 1428 Sample2
#> file2.7 ACAAAAGAATTC 0.0002662407 1 Sample2
#> file2.8 AGGAAAGAATTG 0.1589456869 597 Sample2
#> file2.9 CGAGAAGAATTGC 0.0878594249 330 Sample2
#> file2.10 GAAAAAAAATTC 0.3727369542 1400 Sample2
# With filtering and subsetting
combineSamples(
rdsfiles,
input_type = "rds",
seq_col = "CloneSeq",
min_seq_length = 13,
drop_matches = "GGG",
subset_cols = "CloneSeq",
sample_ids = c("id01", "id02"),
verbose = TRUE
)
#> Loading sample 1...
#> Input data contains 5 rows.
#> Removing sequences with length fewer than 13 characters...
#> Done. 4 rows remaining.
#> Removing sequences containing matches to “GGG”...
#> Done. 3 rows remaining.
#> Loading sample 2...
#> Input data contains 5 rows.
#> Removing sequences with length fewer than 13 characters...
#> Done. 2 rows remaining.
#> Removing sequences containing matches to “GGG”...
#> Done. 2 rows remaining.
#> CloneSeq SampleID
#> id01.1 TTGAGGAAATTGC id01
#> id01.2 GGAGATGAATTGG id01
#> id01.5 GAAAGAGAATCGG id01
#> id02.6 AAACACGAATTCG id02
#> id02.9 CGAGAAGAATTGC id02
# RData, different data frame names
rdafiles <- tempfile(c("sample1", "sample2"), fileext = ".rda")
save(sample_1, file = rdafiles[1])
save(sample_2, file = rdafiles[2])
loadDataFromFileList(
rdafiles,
input_type = "rda",
data_symbols = c("sample_1", "sample_2")
)
#> CloneSeq CloneFrequency CloneCount SampleID
#> file1.1 TTGAGGAAATTGC 0.1064411838 428 Sample1
#> file1.2 GGAGATGAATTGG 0.2467047998 992 Sample1
#> file1.3 GTCGGGTAATTGG 0.1178811241 474 Sample1
#> file1.4 GCCGGGTAATTC 0.4794827157 1928 Sample1
#> file1.5 GAAAGAGAATCGG 0.0494901766 199 Sample1
#> file2.6 AAACACGAATTCG 0.3801916933 1428 Sample2
#> file2.7 ACAAAAGAATTC 0.0002662407 1 Sample2
#> file2.8 AGGAAAGAATTG 0.1589456869 597 Sample2
#> file2.9 CGAGAAGAATTGC 0.0878594249 330 Sample2
#> file2.10 GAAAAAAAATTC 0.3727369542 1400 Sample2
# RData, same data frame names
df <- sample_1
save(df, file = rdafiles[1])
df <- sample_2
save(df, file = rdafiles[2])
loadDataFromFileList(
rdafiles,
input_type = "rda",
data_symbols = "df"
)
#> CloneSeq CloneFrequency CloneCount SampleID
#> file1.1 TTGAGGAAATTGC 0.1064411838 428 Sample1
#> file1.2 GGAGATGAATTGG 0.2467047998 992 Sample1
#> file1.3 GTCGGGTAATTGG 0.1178811241 474 Sample1
#> file1.4 GCCGGGTAATTC 0.4794827157 1928 Sample1
#> file1.5 GAAAGAGAATCGG 0.0494901766 199 Sample1
#> file2.6 AAACACGAATTCG 0.3801916933 1428 Sample2
#> file2.7 ACAAAAGAATTC 0.0002662407 1 Sample2
#> file2.8 AGGAAAGAATTG 0.1589456869 597 Sample2
#> file2.9 CGAGAAGAATTGC 0.0878594249 330 Sample2
#> file2.10 GAAAAAAAATTC 0.3727369542 1400 Sample2
# comma-separated values with header row; row names in first column
csvfiles <- tempfile(c("sample1", "sample2"), fileext = ".csv")
utils::write.csv(sample_1, csvfiles[1], row.names = TRUE)
utils::write.csv(sample_2, csvfiles[2], row.names = TRUE)
loadDataFromFileList(
csvfiles,
input_type = "csv",
read.args = list(row.names = 1)
)
#> CloneSeq CloneFrequency CloneCount SampleID
#> file1.1 TTGAGGAAATTGC 0.1064411838 428 Sample1
#> file1.2 GGAGATGAATTGG 0.2467047998 992 Sample1
#> file1.3 GTCGGGTAATTGG 0.1178811241 474 Sample1
#> file1.4 GCCGGGTAATTC 0.4794827157 1928 Sample1
#> file1.5 GAAAGAGAATCGG 0.0494901766 199 Sample1
#> file2.6 AAACACGAATTCG 0.3801916933 1428 Sample2
#> file2.7 ACAAAAGAATTC 0.0002662407 1 Sample2
#> file2.8 AGGAAAGAATTG 0.1589456869 597 Sample2
#> file2.9 CGAGAAGAATTGC 0.0878594249 330 Sample2
#> file2.10 GAAAAAAAATTC 0.3727369542 1400 Sample2
# semicolon-separated values with decimals as commas;
# header row, row names in first column
utils::write.csv2(sample_1, csvfiles[1], row.names = TRUE)
utils::write.csv2(sample_2, csvfiles[2], row.names = TRUE)
loadDataFromFileList(
csvfiles,
input_type = "csv2",
read.args = list(row.names = 1)
)
#> CloneSeq CloneFrequency CloneCount SampleID
#> file1.1 TTGAGGAAATTGC 0.1064411838 428 Sample1
#> file1.2 GGAGATGAATTGG 0.2467047998 992 Sample1
#> file1.3 GTCGGGTAATTGG 0.1178811241 474 Sample1
#> file1.4 GCCGGGTAATTC 0.4794827157 1928 Sample1
#> file1.5 GAAAGAGAATCGG 0.0494901766 199 Sample1
#> file2.6 AAACACGAATTCG 0.3801916933 1428 Sample2
#> file2.7 ACAAAAGAATTC 0.0002662407 1 Sample2
#> file2.8 AGGAAAGAATTG 0.1589456869 597 Sample2
#> file2.9 CGAGAAGAATTGC 0.0878594249 330 Sample2
#> file2.10 GAAAAAAAATTC 0.3727369542 1400 Sample2
# tab-separated values with header row and decimals as commas
tsvfiles <- tempfile(c("sample1", "sample2"), fileext = ".tsv")
utils::write.table(sample_1, tsvfiles[1], sep = "\t", dec = ",")
utils::write.table(sample_2, tsvfiles[2], sep = "\t", dec = ",")
loadDataFromFileList(
tsvfiles,
input_type = "tsv",
header = TRUE,
read.args = list(dec = ",")
)
#> CloneSeq CloneFrequency CloneCount SampleID
#> file1.1 TTGAGGAAATTGC 0.1064411838 428 Sample1
#> file1.2 GGAGATGAATTGG 0.2467047998 992 Sample1
#> file1.3 GTCGGGTAATTGG 0.1178811241 474 Sample1
#> file1.4 GCCGGGTAATTC 0.4794827157 1928 Sample1
#> file1.5 GAAAGAGAATCGG 0.0494901766 199 Sample1
#> file2.6 AAACACGAATTCG 0.3801916933 1428 Sample2
#> file2.7 ACAAAAGAATTC 0.0002662407 1 Sample2
#> file2.8 AGGAAAGAATTG 0.1589456869 597 Sample2
#> file2.9 CGAGAAGAATTGC 0.0878594249 330 Sample2
#> file2.10 GAAAAAAAATTC 0.3727369542 1400 Sample2
# space-separated values with header row and NAs encoded as "No Value"
txtfiles <- tempfile(c("sample1", "sample2"), fileext = ".txt")
utils::write.table(sample_1, txtfiles[1], na = "No Value")
utils::write.table(sample_2, txtfiles[2], na = "No Value")
loadDataFromFileList(
txtfiles,
input_type = "table",
read.args = list(
header = TRUE,
na.strings = "No Value"
)
)
#> CloneSeq CloneFrequency CloneCount SampleID
#> file1.1 TTGAGGAAATTGC 0.1064411838 428 Sample1
#> file1.2 GGAGATGAATTGG 0.2467047998 992 Sample1
#> file1.3 GTCGGGTAATTGG 0.1178811241 474 Sample1
#> file1.4 GCCGGGTAATTC 0.4794827157 1928 Sample1
#> file1.5 GAAAGAGAATCGG 0.0494901766 199 Sample1
#> file2.6 AAACACGAATTCG 0.3801916933 1428 Sample2
#> file2.7 ACAAAAGAATTC 0.0002662407 1 Sample2
#> file2.8 AGGAAAGAATTG 0.1589456869 597 Sample2
#> file2.9 CGAGAAGAATTGC 0.0878594249 330 Sample2
#> file2.10 GAAAAAAAATTC 0.3727369542 1400 Sample2
# custom value separator and row names in first column
utils::write.table(sample_1, txtfiles[1],
sep = "@", row.names = TRUE, col.names = FALSE
)
utils::write.table(sample_2, txtfiles[2],
sep = "@", row.names = TRUE, col.names = FALSE
)
loadDataFromFileList(
txtfiles,
input_type = "table",
sep = "@",
read.args = list(
row.names = 1,
col.names = c("rownames",
"CloneSeq", "CloneFrequency",
"CloneCount", "SampleID"
)
)
)
#> CloneSeq CloneFrequency CloneCount SampleID
#> file1.1 TTGAGGAAATTGC 0.1064411838 428 Sample1
#> file1.2 GGAGATGAATTGG 0.2467047998 992 Sample1
#> file1.3 GTCGGGTAATTGG 0.1178811241 474 Sample1
#> file1.4 GCCGGGTAATTC 0.4794827157 1928 Sample1
#> file1.5 GAAAGAGAATCGG 0.0494901766 199 Sample1
#> file2.6 AAACACGAATTCG 0.3801916933 1428 Sample2
#> file2.7 ACAAAAGAATTC 0.0002662407 1 Sample2
#> file2.8 AGGAAAGAATTG 0.1589456869 597 Sample2
#> file2.9 CGAGAAGAATTGC 0.0878594249 330 Sample2
#> file2.10 GAAAAAAAATTC 0.3727369542 1400 Sample2
# same as previous example
# (value of sep in read.args overrides value in sep argument)
loadDataFromFileList(
txtfiles,
input_type = "table",
sep = "\t",
read.args = list(
sep = "@",
row.names = 1,
col.names = c("rownames",
"CloneSeq", "CloneFrequency",
"CloneCount", "SampleID"
)
)
)
#> CloneSeq CloneFrequency CloneCount SampleID
#> file1.1 TTGAGGAAATTGC 0.1064411838 428 Sample1
#> file1.2 GGAGATGAATTGG 0.2467047998 992 Sample1
#> file1.3 GTCGGGTAATTGG 0.1178811241 474 Sample1
#> file1.4 GCCGGGTAATTC 0.4794827157 1928 Sample1
#> file1.5 GAAAGAGAATCGG 0.0494901766 199 Sample1
#> file2.6 AAACACGAATTCG 0.3801916933 1428 Sample2
#> file2.7 ACAAAAGAATTC 0.0002662407 1 Sample2
#> file2.8 AGGAAAGAATTG 0.1589456869 597 Sample2
#> file2.9 CGAGAAGAATTGC 0.0878594249 330 Sample2
#> file2.10 GAAAAAAAATTC 0.3727369542 1400 Sample2