Create multivariate (correlated) data - for general distributions

addCorGen(
  dtOld,
  nvars,
  idvar = "id",
  rho,
  corstr,
  corMatrix = NULL,
  dist,
  param1,
  param2 = NULL,
  cnames = NULL,
  method = "copula",
  formSpec = NULL,
  periodvar = "period"
)

Arguments

dtOld

The existing data.table to which the correlated data will be added. It must contain the id column named in idvar and the parameter column(s) named in param1 (and param2, if used).

nvars

Number of new variables to create for each id.

idvar

String name of the column that represents the individual-level id for the correlated data.

rho

Correlation coefficient, -1 <= rho <= 1. Use if corMatrix is not provided.

corstr

Correlation structure of the correlation matrix defined by rho. Options include "cs" for a compound symmetry structure and "ar1" for an autoregressive structure. Use if corMatrix is not provided.

corMatrix

Correlation matrix can be entered directly. It must be symmetrical and positive semi-definite. It is not a required field; if a matrix is not provided, then a structure and correlation coefficient rho must be specified.

dist

A string indicating the distribution of the new variables: "normal", "binary", "poisson", "gamma", or "uniform".

param1

A string that represents the column in dtOld that contains the parameter for the mean of the distribution. In the case of the uniform distribution, the column specifies the minimum instead.

param2

A string that represents the column in dtOld that contains a possible second parameter for the distribution. For the normal distribution, this will be the variance; for the gamma distribution, this will be the dispersion; and for the uniform distribution, this will be the maximum.

cnames

Explicit column names. A single string with names separated by commas. If no string is provided, the default names will be V#, where # represents the column.

method

Two methods are available to generate correlated data. (1) "copula" uses the multivariate Gaussian copula method, which can be applied to all available distributions. (2) "ep" uses an algorithm developed by Emrich and Piedmonte (1991) for generating correlated binary data.

formSpec

The formula (as a string) that was used to generate the binary outcome in the `defDataAdd` statement. This is only necessary when method "ep" is requested.

periodvar

A string value that indicates the name of the field that indexes the repeated measurement for an individual unit. The value defaults to "period".

Value

The original data.table with the added column(s) of correlated data.

References

Emrich LJ, Piedmonte MR. A Method for Generating High-Dimensional Multivariate Binary Variates. The American Statistician 1991;45:302-4.

Examples

# Wide example def <- defData(varname = "xbase", formula = 5, variance = .4, dist = "gamma", id = "cid") def <- defData(def, varname = "lambda", formula = ".5 + .1*xbase", dist = "nonrandom", link = "log") def <- defData(def, varname = "p", formula = "-2 + .3*xbase", dist = "nonrandom", link = "logit") dt <- genData(500, def) dtX1 <- addCorGen( dtOld = dt, idvar = "cid", nvars = 3, rho = .7, corstr = "cs", dist = "poisson", param1 = "lambda" ) dtX2 <- addCorGen( dtOld = dt, idvar = "cid", nvars = 4, rho = .4, corstr = "ar1", dist = "binary", param1 = "p" ) # Long example def <- defData(varname = "xbase", formula = 5, variance = .4, dist = "gamma", id = "cid") def <- defData(def, "nperiods", formula = 3, dist = "noZeroPoisson") def2 <- defDataAdd( varname = "lambda", formula = ".5+.5*period + .1*xbase", dist = "nonrandom", link = "log" ) def2 <- defDataAdd(def2, varname = "p", formula = "-3+.2*period + .3*xbase", dist = "nonrandom", link = "logit" ) def2 <- defDataAdd(def2, varname = "gammaMu", formula = ".2*period + .3*xbase", dist = "nonrandom", link = "log" ) def2 <- defDataAdd(def2, varname = "gammaDis", formula = 1, dist = "nonrandom" ) def2 <- defDataAdd(def2, varname = "normMu", formula = "5+period + .5*xbase", dist = "nonrandom" ) def2 <- defDataAdd(def2, varname = "normVar", formula = 4, dist = "nonrandom" ) def2 <- defDataAdd(def2, varname = "unifMin", formula = "5 + 2*period + .2*xbase", dist = "nonrandom" ) def2 <- defDataAdd(def2, varname = "unifMax", formula = "unifMin + 20", dist = "nonrandom" ) dt <- genData(1000, def) dtLong <- addPeriods(dt, idvars = "cid", nPeriods = 3) dtLong <- addColumns(def2, dtLong) # Poisson distribution dtX3 <- addCorGen( dtOld = dtLong, idvar = "cid", nvars = 3, rho = .6, corstr = "cs", dist = "poisson", param1 = "lambda", cnames = "NewPois" ) dtX3
#> cid period xbase nperiods timeID lambda p gammaMu #> 1: 1 0 2.033135 4 1 2.020436 0.08393386 1.840322 #> 2: 1 1 2.033135 4 2 3.331136 0.10064672 2.247774 #> 3: 1 2 2.033135 4 3 5.492116 0.12025058 2.745438 #> 4: 2 0 12.657813 5 4 5.846138 0.68940602 44.582609 #> 5: 2 1 12.657813 5 5 9.638652 0.73053603 54.453321 #> --- #> 2996: 999 1 2.023406 2 2996 3.327897 0.10038284 2.241224 #> 2997: 999 2 2.023406 2 2997 5.486775 0.11994215 2.737437 #> 2998: 1000 0 10.464250 1 2998 4.694657 0.53476256 23.087121 #> 2999: 1000 1 10.464250 1 2999 7.740180 0.58401438 28.198673 #> 3000: 1000 2 10.464250 1 3000 12.761400 0.63164373 34.441937 #> gammaDis normMu normVar unifMin unifMax NewPois #> 1: 1 6.016568 4 5.406627 25.40663 2 #> 2: 1 7.016568 4 7.406627 27.40663 4 #> 3: 1 8.016568 4 9.406627 29.40663 7 #> 4: 1 11.328906 4 7.531563 27.53156 5 #> 5: 1 12.328906 4 9.531563 29.53156 8 #> --- #> 2996: 1 7.011703 4 7.404681 27.40468 3 #> 2997: 1 8.011703 4 9.404681 29.40468 3 #> 2998: 1 10.232125 4 7.092850 27.09285 3 #> 2999: 1 11.232125 4 9.092850 29.09285 7 #> 3000: 1 12.232125 4 11.092850 31.09285 11
# Binomial distribution - copula method dtX4 <- addCorGen( dtOld = dtLong, idvar = "cid", nvars = 3, rho = .6, corstr = "cs", dist = "binary", param1 = "p", cnames = "NewBin" ) dtX4
#> cid period xbase nperiods timeID lambda p gammaMu #> 1: 1 0 2.033135 4 1 2.020436 0.08393386 1.840322 #> 2: 1 1 2.033135 4 2 3.331136 0.10064672 2.247774 #> 3: 1 2 2.033135 4 3 5.492116 0.12025058 2.745438 #> 4: 2 0 12.657813 5 4 5.846138 0.68940602 44.582609 #> 5: 2 1 12.657813 5 5 9.638652 0.73053603 54.453321 #> --- #> 2996: 999 1 2.023406 2 2996 3.327897 0.10038284 2.241224 #> 2997: 999 2 2.023406 2 2997 5.486775 0.11994215 2.737437 #> 2998: 1000 0 10.464250 1 2998 4.694657 0.53476256 23.087121 #> 2999: 1000 1 10.464250 1 2999 7.740180 0.58401438 28.198673 #> 3000: 1000 2 10.464250 1 3000 12.761400 0.63164373 34.441937 #> gammaDis normMu normVar unifMin unifMax NewBin #> 1: 1 6.016568 4 5.406627 25.40663 0 #> 2: 1 7.016568 4 7.406627 27.40663 0 #> 3: 1 8.016568 4 9.406627 29.40663 0 #> 4: 1 11.328906 4 7.531563 27.53156 0 #> 5: 1 12.328906 4 9.531563 29.53156 1 #> --- #> 2996: 1 7.011703 4 7.404681 27.40468 0 #> 2997: 1 8.011703 4 9.404681 29.40468 0 #> 2998: 1 10.232125 4 7.092850 27.09285 0 #> 2999: 1 11.232125 4 9.092850 29.09285 0 #> 3000: 1 12.232125 4 11.092850 31.09285 0
# Gamma distribution dtX6 <- addCorGen( dtOld = dtLong, idvar = "cid", nvars = 3, rho = .6, corstr = "ar1", dist = "gamma", param1 = "gammaMu", param2 = "gammaDis", cnames = "NewGamma" ) dtX6
#> cid period xbase nperiods timeID lambda p gammaMu #> 1: 1 0 2.033135 4 1 2.020436 0.08393386 1.840322 #> 2: 1 1 2.033135 4 2 3.331136 0.10064672 2.247774 #> 3: 1 2 2.033135 4 3 5.492116 0.12025058 2.745438 #> 4: 2 0 12.657813 5 4 5.846138 0.68940602 44.582609 #> 5: 2 1 12.657813 5 5 9.638652 0.73053603 54.453321 #> --- #> 2996: 999 1 2.023406 2 2996 3.327897 0.10038284 2.241224 #> 2997: 999 2 2.023406 2 2997 5.486775 0.11994215 2.737437 #> 2998: 1000 0 10.464250 1 2998 4.694657 0.53476256 23.087121 #> 2999: 1000 1 10.464250 1 2999 7.740180 0.58401438 28.198673 #> 3000: 1000 2 10.464250 1 3000 12.761400 0.63164373 34.441937 #> gammaDis normMu normVar unifMin unifMax NewGamma #> 1: 1 6.016568 4 5.406627 25.40663 1.7635282 #> 2: 1 7.016568 4 7.406627 27.40663 2.6140461 #> 3: 1 8.016568 4 9.406627 29.40663 1.7093672 #> 4: 1 11.328906 4 7.531563 27.53156 16.9854522 #> 5: 1 12.328906 4 9.531563 29.53156 50.6781317 #> --- #> 2996: 1 7.011703 4 7.404681 27.40468 0.1006445 #> 2997: 1 8.011703 4 9.404681 29.40468 0.9576648 #> 2998: 1 10.232125 4 7.092850 27.09285 5.0278293 #> 2999: 1 11.232125 4 9.092850 29.09285 2.2798681 #> 3000: 1 12.232125 4 11.092850 31.09285 13.0286195
# Normal distribution dtX7 <- addCorGen( dtOld = dtLong, idvar = "cid", nvars = 3, rho = .6, corstr = "ar1", dist = "normal", param1 = "normMu", param2 = "normVar", cnames = "NewNorm" ) # Binary outcome - ep method probform <- "-2 + .3*period" def1 <- defDataAdd( varname = "p", formula = probform, dist = "nonrandom", link = "logit" ) dx <- genData(100) dx <- addPeriods(dx, nPeriods = 4) dx <- addColumns(def1, dx) dg <- addCorGen(dx, nvars = 4, corMatrix = NULL, rho = .3, corstr = "cs", dist = "binary", param1 = "p", method = "ep", formSpec = probform, periodvar = "period" )