cran · PaulaAlessio · Mar 24, 2017 · Mar 27, 2017 · Mar 27, 2017
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,6 @@
+pcalg-Ex.R
+src/*.o 
+src/*.so 
+src/symbols.rds
+*~ 
+
diff --git a/NAMESPACE b/NAMESPACE
@@ -105,6 +105,7 @@ export(trueCov,
        binCItest,
        ida,
        idaFast,
+       idaFastOpt,
        legal.path,
        plotSG,
        pc.cons.intern,

diff --git a/R/pcalg.R b/R/pcalg.R
@@ -3207,6 +3207,35 @@ idaFast <- function(x.pos, y.pos.set, mcov, graphEst)
   beta.hat
 }
 
+  ## Purpose: Estimate the causal effect of x.pos.set on each element in the
+  ## graph using the local method; graphEst and correlation matrix
+  ## have to be precomputed; orient
+  ## undirected edges at x in a way so that no new collider is
+  ## introduced; if there is an undirected edge between x and y, both directions are considered;
+  ## i.e., y might be partent of x in which case the effect is 0.
+  ## ----------------------------------------------------------------------
+  ## Arguments:
+  ## - x.pos.set: 
+  ## - mcov: Covariance matrix that was used to estimate graphEst
+  ## - graphEst: Fit of PC Algorithm (semidirected)
+  ## ----------------------------------------------------------------------
+  ## Value: list of causal values; one list element (matrix) for each element of
+  ## x.pos.set 
+  ## ----------------------------------------------------------------------
+  ## Author: Paula Perez, Date: 17 Jan 2017
+
+idaFastOpt <- function(x.pos.set,mcov,graphEst)
+{
+     if(missing(x.pos.set)){
+       x.pos.set = 1:ncol(mcov)
+     }
+     adjmat <- as.matrix(wgtMatrix(graphEst))
+     .Call("idaFastC",(x.pos.set - 1) ,mcov,adjmat)
+}
+
+
+
+
 ## -> ../man/legal.path.Rd
 ## only called in  qreach()  with only 'c' varying
 legal.path <- function(a,b,c, amat)

diff --git a/README.md b/README.md
@@ -0,0 +1,94 @@
+# Pcalg fork
+
+## Description 
+
+ This is a fork from the `R` package `pcalg` 
+that implements a `C++` optimization of the 
+function idaFast, idaFastOpt. Using `C++` together
+with the `Armadillo` library instead of `R` improves the 
+performance already. On top of that, the new implementation
+seems to have reduced the complexity of the algorithm (see 
+Details and Benchmarks). 
+
+* **Patch file**:  `dataOpt/patch_pcalg.txt`
+
+
+### Details
+
+Given a *completed partially directed acyclic graph* `pdag`, and
+a given node `x`, we identify the unique parents `uniq_pa`  and the 
+ambiguous parents `amb_pa`. From the set of `amb_pa`, we direct them 
+in all possible ways (`2`<sup>`|amb_pa|`</sup>) checking first that 
+they introduce no new colliders. This implementation differs with 
+the original implementation in how "all possible ways" are examined. 
+
+* **Original code:** A loop from 1 to the number of elements
+in `amb_pa`, `|amb_pa|` is executed, and the function,
+
+   ``pa.tmp <- combn(amb_pa,i,simplify = TRUE)``
+
+   is called where `i` is the variable in the loop. All sets of size `i` in `amb_pa`
+are obtained. For every set, all parents in it are directed towards 
+`x`, the ones not in it  outwards. If the configuration does not introduce a new 
+collider, the causal effects are computed, nothing is done otherwise. 
+
+* **Optimization:** A loop from 0 until  `2`<sup>`|amb_pa| - 1`</sup> is executed, 
+where binary representation of `i` (loop variable, `long int`) is sweeping 
+over the half posibilities. For every `i` we create two subsets 
+`pa_1` and `pa_2` complementary to each other. `pa_1` (`pa_2`) contains the `jth` element
+of `amb_pa` if the `jth` bit of `i` is 1 (0).  In that way, we have covered all 
+possibilities if we first set `pa_1` to be directed towards `x`, `pa_2` outwards, 
+check for colliders, compute causal effects and then do the same reversing the roles
+of `pa_1` and `pa_2.`
+
+
+
+### Benchmarks 
+
+ Comparison of performance of  `idaFast` (original function of the package)
+versus our implementation, `idaFastOpt`. We have used  real data and created **cpdag** of
+100,200,500,1000,5000 nodes. `idaFast` and `idaFastOpt` have then been run on these
+graphs on a Intel(R) Xeon(R) CPU E5-4620 0 @ 2.20GHz. In the graph below, we can see 
+the performance of the two functions, in a log-log scale. 
+
+We see not only that the runtime is significantly reduced, but also, and 
+for the **cpdag** we were testing the functions on, `idaFast` seemed to scale like 
+`O(n`<sup>`3`</sup>`)` whereas `idaFastOpt` like `O(n`<sup>`2`</sup>`)`. Note that 
+the scaling  is not generalizable to all types of graphs. For different input graphs, 
+one can expect a different complexity. Nevertheless, it seems that the new implementation
+has reduced it.  
+
+
+<center> <img src="dataOpt/timings_log.png" /> </center>
+
+## Usage
+
+``idaFastOpt(x.pos.set = 1:N, mcov = cov(subDat), graphEst = l$cpdag_graph)``
+
+
+* **Arguments:** 
+   - `x.pos.set`: (integer vector) optional argument. 
+   Nodes on which the causal effects are going to 
+   be computed. If not passed, the function computes 
+   the causal effects on all nodes. Note a vector of positions
+   of the target variables is not passed as an argument, in contrast 
+   to `idaFast` since in this function all nodes are going to be 
+   targets.
+   - `mcov`: covariance matrix that was used to estimate `graphEst`
+   - `graphEst`: estimated **cpdag** from the function `pc`. If the output of
+   `pc` is `pc.fit`, then the estimated **cpdag** can be obtained by
+   `pc.fit@graph`.
+
+* **Output:**
+   list of causal values; one list element (matrix) for each element of 
+   `x.pos.set`
+
+## Tests 
+
+  In the folder `dataOpt`, one can find a code example in `run_idaFast.R`, 
+  where both functions, `idaFast`, `idaFastOpt`, are run for the **cpdag**'s
+  contained in `dataOpt/skeletonfiles`, and a profiling is performed. 
+
+
+  In the same folder, the script `benchmarks.R` creates the plot shown in this README.md 
+  file.
diff --git a/dataOpt/01-005-00_Ergebnis_Uebersicht.csv b/dataOpt/01-005-00_Ergebnis_Uebersicht.csv
@@ -0,0 +1,101 @@
+Sample	Batch	CD40L	BCR	IGF	CpG	IL10	Myc	Number	Viability	S	SubG1	size	total RNA
+1	1	1	0	0	0	0	1	8,4	94,9	47,6	2,7	604	343,63
+2	1	0	1	0	0	0	0	5,4	94,2	12,3	10,2	462	106,82
+3	1	0	0	0	0	0	0	5,6	93,0	4,4	3,9	433	133,7
+4	1	0	0	0	0	0	1	8,4	94,6	43,0	4,0	577	226,45
+5	1	0	1	1	1	1	1	8,0	97,4	63,0	2,8	638	356,71
+6	1	1	1	1	1	1	1	8,8	96,4	59,2	2,3	656	383,64
+7	1	1	1	0	1	0	1	7,7	94,5	54,2	2,7	655	399,47
+8	1	1	0	0	1	1	0	8,3	95,2	45,3	1,9	586	188,31
+9	1	0,2	1	1	0	1	0	6,8	92,3	24,4	2,2	557	164,12
+10	1	0	0	0	1	1	0	6,3	95,9	41,4	2,2	574	225,71
+11	2	0	1	0	0	0	1	8,4	93,9	42,4	5,3	543	311,88
+12	2	0	0	1	0	0	0	6,0	93,9	7,0	5,7	479	103,66
+13	2	0	0	0	0	0	0	5,6	92,1	6,2	6,8	461	131,53
+14	2	0	0	0	0	0	1	8,6	95,0	37,0	5,4	550	277,7
+15	2	0	1	0,2	0,2	0	1	8,5	95,9	42,2	4,1	528	364,36
+16	2	1	0,2	0,2	0	0	1	8,0	95,3	38,5	5,9	497	341,34
+17	2	1	1	1	1	0,2	1	8,6	96,3	57,9	4,1	564	420,51
+18	2	0,2	0,2	0,2	0,2	0,2	0	6,5	95,6	28,0	6,6	526	178,34
+19	2	1	0,2	0	0	1	0	5,9	92,0	20,3	4,8	541	185,92
+20	2	0,2	1	0,2	0	0,2	0	5,6	93,5	16,5	9,0	509	157,58
+21	3	0	0	1	0	0	1	9,4	93,1	37,9	4,2	512	246,07
+22	3	0	0	0	1	0	0	6,2	95,4	24,9	2,2	594	176,59
+23	3	0	0	0	0	0	0	6,2	92,8	4,9	2,5	515	135,29
+24	3	0	0	0	0	0	1	8,3	92,9	36,6	3,7	612	232,25
+25	3	0,2	0,2	0,2	0	0	1	8,9	95,2	38,1	3,2	707	300,09
+26	3	0,2	1	0	0	1	1	7,8	91,6	43,5	1,9	658	306,57
+27	3	1	0	0	1	1	1	8,4	94,3	54,9	2,3	714	362,05
+28	3	0,2	1	0,2	1	0	0	6,2	92,3	38,2	1,4	638	213,57
+29	3	0	0	0	0,2	1	0	6,4	91,8	28,7	1,5	582	187,55
+30	3	0,2	1	0,2	0,2	0,2	0	6,1	94,0	27,5	2,2	624	189,19
+31	4b	0	0	0	1	0	1	9,6	93,3	42,5	4,0	595	388,83
+32	4b	0	0	0	0	1	0	6,8	93,0	12,8	7,3	473	143,68
+33	4b	0	0	0	0	0	0	6,7	93,7	5,8	7,4	438	129,82
+34	4b	0	0	0	0	0	1	8,0	93,1	29,4	5,2	559	260,61
+35	4b	0,2	1	0,2	0,2	0,2	1	8,7	92,1	37,4	4,1	585	342,67
+36	4b	1	1	0,2	0,2	0	1	7,7	92,7	38,6	4,7	593	316,21
+37	4b	0,2	0	1	0,2	0	1	7,8	91,9	36,4	4,7	561	307,76
+38	4b	1	1	0,2	0,2	0	0	6,6	94,0	20,9	6,1	523	176,84
+39	4b	0,2	1	1	0,2	0	0	6,6	93,2	23,2	7,4	504	183,84
+40	4b	1	0	1	1	0,2	0	7,5	93,3	40,2	3,9	531	226,45
+41	5	0	0	0	0	1	1	8,2	93,5	38,0	5,0	488	300,33
+42	5	1	0	0	0	0	0	6,2	92,8	9,9	4,1	503	132,34
+43	5	0	0	0	0	0	0	6,0	93,0	4,8	4,6	498	128,24
+44	5	0	0	0	0	0	1	7,9	92,6	34,7	5,7	577	244,87
+45	5	0	0	0	1	1	1	9,7	92,3	49,0	3,7	535	269,28
+46	5	0,2	0,2	0	1	1	1	9,9	94,6	53,2	3,5	556	253,46
+47	5	0,2	0	0,2	1	0	1	9,2	93,0	40,7	3,3	582	262,22
+48	5	0,2	0,2	0,2	0	0	0	6,4	94,3	9,1	4,0	517	135,09
+49	5	0,2	1	0	0	1	0	6,5	93,6	16,9	3,7	534	151,39
+50	5	0,2	0	0,2	1	0	0	7,0	93,6	23,0	3,6	561	141,69
+51	6	1	0	0	0	0	1	8,1	96,4	37,8	2,9	555	283,87
+52	6	0	1	0	0	0	0	6,2	94,5	10,3	5,6	480	143,37
+53	6	0	0	0	0	0	0	6,6	95,2	4,3	4,2	449	130,24
+54	6	0	0	0	0	0	1	7,8	93,9	32,2	3,8	563	272,4
+55	6	0	0	0	0,2	1	1	9,5	94,3	49,6	3,1	579	319,45
+56	6	0,2	0,2	0,2	1	1	1	8,7	95,4	54,7	2,7	599	318,45
+57	6	0,2	1	1	0	0	1	9,4	93,2	36,8	4,1	547	241,83
+58	6	1	1	0	1	1	0	8,0	96,4	44,4	2,3	549	192,91
+59	6	1	1	1	1	1	0	7,4	93,2	45,5	2,1	572	225,32
+60	6	0,2	0	1	0,2	1	0	7,3	93,8	36,5	3,0	537	169,41
+61	7	0	1	0	0	0	1	9,0	92,3	31,2	3,1	564	283,97
+62	7	0	0	1	0	0	0	6,2	95,4	3,5	1,8	560	122,47
+63	7	0	0	0	0	0	0	6,2	93,2	3,5	2,9	555	143,99
+64	7	0	0	0	0	0	1	7,7	93,8	26,4	6,0	498	256,14
+65	7	0,2	1	1	0	1	1	10,3	91,5	31,2	4,0	582	223,09
+66	7	1	0,2	0	0	1	1	9,6	93,8	29,7	4,8	591	219,84
+67	7	1	1	0	1	1	1	8,4	93,9	51,6	1,9	610	323,49
+68	7	0	0,2	0	0,2	1	0	6,8	93,4	21,7	1,5	677	153,95
+69	7	1	1	1	1	0,2	0	7,2	95,6	34,1	1,5	662	247,39
+70	7	1	1	0	1	0	0	6,7	94,5	32,4	0,6	642	162,63
+71	8	0	0	1	0	0	1	8,9	94,6	43,5	2,9	623	230,06
+72	8	0	0	0	1	0	0	6,0	93,0	28,4	4,3	564	160,84
+73	8	0	0	0	0	0	0	6,0	94,4	7,7	6,5	504	139,93
+74	8	0	0	0	0	0	1	9,6	97,0	39,1	5,2	610	270,27
+75	8	0	0,2	0	0,2	1	1	9,8	97,0	43,6	4,4	657	295,49
+76	8	0,2	0,2	0,2	0,2	0,2	1	9,4	98,1	43,2	4,0	644	289,11
+77	8	0,2	1	0,2	0	0,2	1	9,0	95,8	46,3	4,5	651	316,74
+78	8	0	1	1	1	1	0	7,0	97,5	42,2	4,9	607	219,33
+79	8	0,2	0,2	0	1	1	0	7,5	96,2	39,0	4,4	633	205,68
+80	8	0,2	0,2	0,2	1	1	0	7,2	95,2	42,7	4,0	623	246,05
+81	9	0	0	0	1	0	1	8,4	94,0	39,1	6,9	611	347,9
+82	9	0	0	0	0	1	0	5,7	95,0	12,7	4,3	514	159,7
+83	9	0	0	0	0	0	0	6,3	95,9	5,0	6,1	476	144
+84	9	0	0	0	0	0	1	7,9	92,3	32,2	5,4	578	271,96
+85	9	0,2	1	1	0,2	0	1	8,8	95,1	29,5	5,4	621	315,52
+86	9	0,2	0,2	1	0	0,2	1	8,7	93,5	35,2	4,6	600	293,6
+87	9	1	0	1	1	0,2	1	8,4	95,5	46,4	4,4	644	319,79
+88	9	0	1	0,2	0,2	0	0	5,7	95,5	27,7	6,2	521	150,34
+89	9	0,2	0,2	1	0	0,2	0	6,8	94,5	17,2	4,2	538	141,63
+90	9	1	0	1	0,2	0	0	6,6	95,3	28,0	3,7	555	155,49
+91	10	0	0	0	0	1	1	8,7	93,3	34,6	7,1	661	254,27
+92	10	1	0	0	0	0	0	6,3	95,5	7,7	4,7	539	129,75
+93	10	0	0	0	0	0	0	6,2	94,5	3,9	8,8	530	142,34
+94	10	0	0	0	0	0	1	8,5	91,9	29,9	7,6	618	217,24
+95	10	1	0	1	0,2	0	1	9,0	93,4	42,0	3,6	686	256,26
+96	10	0,2	1	0,2	1	0	1	8,1	95,7	48,0	4,2	743	295,8
+97	10	0,2	0	1	0,2	1	1	9,1	94,1	47,7	3,5	721	294,99
+98	10	0,2	1	1	0	0	0	6,2	92,8	14,2	9,0	545	185,25
+99	10	0,2	0	1	0,2	0	0	6,5	95,1	25,4	5,0	560	157,29
+100	10	1	0,2	0,2	0	0	0	6,3	92,1	10,3	5,6	540	139,9
diff --git a/dataOpt/benchmarks.R b/dataOpt/benchmarks.R
@@ -0,0 +1,51 @@
+# This script creates a graph with the timings of idaFast
+timings <-  read.csv("timings.csv", stringsAsFactors=FALSE)
+
+
+fast <- timings[which(timings$optimized==TRUE & timings$Myc == "Low"), ]
+slow <- timings[which(timings$optimized==FALSE & timings$Myc == "Low"), ]
+
+
+
+png("timings.png", width=400, height=400, units ="px")
+
+r_slow <- lm(log(slow[,2]) ~ log(slow[,1]) )
+r_fast <- lm(log(fast[,2]) ~ log(fast[,1]) )
+
+
+plot(slow[,1:2],col="red", xlab = "Anzahl von Genen", 
+   ylab = " Laufzeit (Sekunden)", pch=16, cex=1.4, xaxt= "n")
+points(fast[,1:2],col="blue",pch= 8, lwd=2)
+
+axis(1,slow[,1],slow[,1])
+legend("topleft",legend = c("urspruengliche Funktion",
+       "optimierte Funktion"), pch = c(16,8), 
+       pt.cex = c(1.4,1), pt.lwd=c(1,2), col = c("red","blue"))
+       #lwd= c(1,2), cex=c(1.4,1),col = c("red","blue"))
+
+mtext("200", at =c(250), line = -20.55 )
+
+dev.off()
+
+png("timings_log.png", width=400, height=400, units ="px")
+
+x <- c(1:5000)
+y <- x^(r_slow$coefficients[2])*exp(r_slow$coefficients[1])
+plot(slow[,1:2],col="red", xlab = "Number of genes, n", 
+   ylab = " Time, t (sec)", ylim = c(0.01,50000), pch=16, cex=1.4, yaxt= "n", log = "xy")
+lines(x,y,lty=2, col="lightgray",lwd=2)
+x <- c(1:5000)
+y <- x^(r_fast$coefficients[2])*exp(r_fast$coefficients[1])
+lines(x,y,lty=2, col="lightgray",lwd=2)
+points(fast[,1:2],col="blue",pch= 8, lwd=2)
+axis(2,c(0.01,0.1,1,10,100,1000,10000),c("0.01","0.1","1","10","100","1000","10000"), las = 2)
+legend("topleft",legend = c("idaFast",
+       "idaFastOpt"), pch = c(16,8), 
+       pt.cex = c(1.4,1), pt.lwd=c(1,2), col = c("red","blue"))
+       #lwd= c(1,2), cex=c(1.4,1),col = c("red","blue"))
+slope <- sprintf("%.02f", r_slow$coefficients[2])
+text(200,10,bquote( 't ~ '~ O(n^.(slope)) ),srt=38)
+slope <- sprintf("%.02f", r_fast$coefficients[2])
+text(200,0.17,bquote( 't ~ '~ O(n^.(slope)) ), srt=25)
+
+dev.off()