diff --git a/.~lock.Formatted_Results.ods# b/.~lock.Formatted_Results.ods# index 8fd9f05..73d0e93 100644 --- a/.~lock.Formatted_Results.ods# +++ b/.~lock.Formatted_Results.ods# @@ -1 +1 @@ -,noah,NovaArchSys,02.05.2022 16:43,file:///home/noah/.config/libreoffice/4; \ No newline at end of file +,noah,NovaArchSys,02.05.2022 17:47,file:///home/noah/.config/libreoffice/4; \ No newline at end of file diff --git a/Formatted_Results.ods b/Formatted_Results.ods index 238d471..545ee45 100644 Binary files a/Formatted_Results.ods and b/Formatted_Results.ods differ diff --git a/Report/Bibliography.bib b/Report/Bibliography.bib index 263fe1a..063a7a1 100644 --- a/Report/Bibliography.bib +++ b/Report/Bibliography.bib @@ -1333,4 +1333,27 @@ of 27}, Pages = {80}, Abstract = {<h4>Background</h4>Numerous centrality measures have been introduced to identify "central" nodes in large networks. The availability of a wide range of measures for ranking influential nodes leaves the user to decide which measure may best suit the analysis of a given network. The choice of a suitable measure is furthermore complicated by the impact of the network topology on ranking influential nodes by centrality measures. To approach this problem systematically, we examined the centrality profile of nodes of yeast protein-protein interaction networks (PPINs) in order to detect which centrality measure is succeeding in predicting influential proteins. We studied how different topological network features are reflected in a large set of commonly used centrality measures.<h4>Results</h4>We used yeast PPINs to compare 27 common of centrality measures. The measures characterize and assort influential nodes of the networks. We applied principal component analysis (PCA) and hierarchical clustering and found that the most informative measures depend on the network's topology. 
Interestingly, some measures had a high level of contribution in comparison to others in all PPINs, namely Latora closeness, Decay, Lin, Freeman closeness, Diffusion, Residual closeness and Average distance centralities.<h4>Conclusions</h4>The choice of a suitable set of centrality measures is crucial for inferring important functional properties of a network. We concluded that undertaking data reduction using unsupervised machine learning methods helps to choose appropriate variables (centrality measures). Hence, we proposed identifying the contribution proportions of the centrality measures with PCA as a prerequisite step of network analysis before inferring functional consequences, e.g., essentiality of a node.}, URL = {https://europepmc.org/articles/PMC6069823}, +} + +@Article{Katz, + author={Leo Katz}, + title={{A new status index derived from sociometric analysis}}, + journal={Psychometrika}, + year=1953, + volume={18}, + number={1}, + pages={39-43}, + month={March}, + keywords={}, + doi={10.1007/BF02289026}, + abstract={No abstract is available for this item.}, + url={https://ideas.repec.org/a/spr/psycho/v18y1953i1p39-43.html} +} + +@article{ModKatz, + title={Katz centrality of Markovian temporal networks: Analysis and optimization}, + author={Masaki Ogura and Victor M. 
Preciado}, + journal={2017 American Control Conference (ACC)}, + year={2017}, + pages={5001-5006} } \ No newline at end of file diff --git a/Report/Schrick-Noah_CS-7863_Final-Report.aux b/Report/Schrick-Noah_CS-7863_Final-Report.aux index 924ffd1..8b5df99 100644 --- a/Report/Schrick-Noah_CS-7863_Final-Report.aux +++ b/Report/Schrick-Noah_CS-7863_Final-Report.aux @@ -27,9 +27,12 @@ \@writefile{toc}{\contentsline {section}{\numberline {3}Experimental Networks}{4}{}\protected@file@percent } \@writefile{toc}{\contentsline {section}{\numberline {4}Centralities}{4}{}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {4.1}Introduction}{4}{}\protected@file@percent } +\citation{Katz} \@writefile{toc}{\contentsline {subsection}{\numberline {4.2}Degree}{5}{}\protected@file@percent } -\@writefile{toc}{\contentsline {subsection}{\numberline {4.3}Betweenness}{6}{}\protected@file@percent } +\@writefile{toc}{\contentsline {subsection}{\numberline {4.3}Betweenness}{5}{}\protected@file@percent } +\newlabel{eq:between}{{1}{5}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.4}Katz}{6}{}\protected@file@percent } +\newlabel{eq:Katz}{{2}{6}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.5}K-Path Edge}{6}{}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {4.6}Adapted Page Rank}{6}{}\protected@file@percent } \@writefile{toc}{\contentsline {section}{\numberline {5}Transitive Closure}{6}{}\protected@file@percent } @@ -53,6 +56,7 @@ \bibcite{noauthor_health_1996}{10} \bibcite{PCI}{11} \bibcite{PMID:30064421}{12} +\bibcite{Katz}{13} \bibstyle{ieeetr} \@writefile{toc}{\contentsline {section}{Bibliography}{7}{}\protected@file@percent } \gdef \@abspage@last{7} diff --git a/Report/Schrick-Noah_CS-7863_Final-Report.bbl b/Report/Schrick-Noah_CS-7863_Final-Report.bbl index 482a21e..b97e4ee 100644 --- a/Report/Schrick-Noah_CS-7863_Final-Report.bbl +++ b/Report/Schrick-Noah_CS-7863_Final-Report.bbl @@ -58,4 +58,8 
@@ M.~Ashtiani, A.~Salehzadeh-Yazdi, Z.~Razaghi-Moghadam, H.~Hennig, centrality measures for protein-protein interaction networks,'' {\em BMC systems biology}, vol.~12, p.~80, July 2018. +\bibitem{Katz} +L.~Katz, ``{A new status index derived from sociometric analysis},'' {\em + Psychometrika}, vol.~18, pp.~39--43, March 1953. + \end{thebibliography} diff --git a/Report/Schrick-Noah_CS-7863_Final-Report.blg b/Report/Schrick-Noah_CS-7863_Final-Report.blg index 830309f..1515a6b 100644 --- a/Report/Schrick-Noah_CS-7863_Final-Report.blg +++ b/Report/Schrick-Noah_CS-7863_Final-Report.blg @@ -4,45 +4,45 @@ The top-level auxiliary file: Schrick-Noah_CS-7863_Final-Report.aux The style file: ieeetr.bst Database file #1: Bibliography.bib Warning--empty booktitle in Mieghem2018DirectedGA -You've used 12 entries, +You've used 13 entries, 1876 wiz_defined-function locations, - 549 strings with 5647 characters, -and the built_in function-call counts, 2222 in all, are: -= -- 211 -> -- 83 + 558 strings with 5746 characters, +and the built_in function-call counts, 2480 in all, are: += -- 239 +> -- 87 < -- 0 -+ -- 34 -- -- 22 -* -- 138 -:= -- 337 -add.period$ -- 16 -call.type$ -- 12 -change.case$ -- 11 ++ -- 36 +- -- 23 +* -- 156 +:= -- 372 +add.period$ -- 17 +call.type$ -- 13 +change.case$ -- 12 chr.to.int$ -- 0 -cite$ -- 13 -duplicate$ -- 117 -empty$ -- 241 -format.name$ -- 22 -if$ -- 541 +cite$ -- 14 +duplicate$ -- 131 +empty$ -- 266 +format.name$ -- 23 +if$ -- 607 int.to.chr$ -- 0 -int.to.str$ -- 12 -missing$ -- 8 -newline$ -- 45 -num.names$ -- 11 -pop$ -- 54 +int.to.str$ -- 13 +missing$ -- 9 +newline$ -- 48 +num.names$ -- 12 +pop$ -- 55 preamble$ -- 1 purify$ -- 0 quote$ -- 0 -skip$ -- 55 +skip$ -- 67 stack$ -- 0 -substring$ -- 64 -swap$ -- 28 +substring$ -- 84 +swap$ -- 35 text.length$ -- 0 text.prefix$ -- 0 top$ -- 0 type$ -- 0 warning$ -- 1 -while$ -- 17 -width$ -- 14 -write$ -- 114 +while$ -- 20 +width$ -- 15 +write$ -- 124 (There was 1 warning) diff --git 
a/Report/Schrick-Noah_CS-7863_Final-Report.log b/Report/Schrick-Noah_CS-7863_Final-Report.log index 5d5b15d..d57b8b5 100644 --- a/Report/Schrick-Noah_CS-7863_Final-Report.log +++ b/Report/Schrick-Noah_CS-7863_Final-Report.log @@ -1,4 +1,4 @@ -This is pdfTeX, Version 3.141592653-2.6-1.40.24 (TeX Live 2022/Arch Linux) (preloaded format=pdflatex 2022.4.29) 2 MAY 2022 16:54 +This is pdfTeX, Version 3.141592653-2.6-1.40.24 (TeX Live 2022/Arch Linux) (preloaded format=pdflatex 2022.4.29) 2 MAY 2022 19:02 entering extended mode restricted \write18 enabled. %&-line parsing enabled. @@ -364,28 +364,38 @@ Underfull \hbox (badness 10000) in paragraph at lines 50--54 \OT1/cmr/m/n/10 Data Security Standard,'' May 2018. Available: [] + +Underfull \hbox (badness 1571) in paragraph at lines 62--64 +[]\OT1/cmr/m/n/10 L. Katz, ``A new status index derived from sociometric analys +is,'' + [] + ) [7 ] (./Schrick-Noah_CS-7863_Final-Report.aux) ) Here is how much of TeX's memory you used: - 5373 strings out of 478238 - 96520 string characters out of 5850456 - 398294 words of memory out of 5000000 - 23550 multiletter control sequences out of 15000+600000 + 5376 strings out of 478238 + 96547 string characters out of 5850456 + 398315 words of memory out of 5000000 + 23553 multiletter control sequences out of 15000+600000 473860 words of font info for 43 fonts, out of 8000000 for 9000 1141 hyphenation exceptions out of 8191 67i,8n,77p,1807b,289s stack positions out of 5000i,500n,10000p,200000b,80000s -Output written on Schrick-Noah_CS-7863_Final-Report.pdf (7 pages, 116347 bytes) +ts/cm/cmbx12.pfb> +Output written on Schrick-Noah_CS-7863_Final-Report.pdf (7 pages, 161981 bytes) . PDF statistics: - 58 PDF objects out of 1000 (max. 8388607) - 36 compressed objects within 1 object stream + 83 PDF objects out of 1000 (max. 8388607) + 51 compressed objects within 1 object stream 0 named destinations out of 1000 (max. 500000) 1 words of extra memory for PDF output out of 10000 (max. 
10000000) diff --git a/Report/Schrick-Noah_CS-7863_Final-Report.pdf b/Report/Schrick-Noah_CS-7863_Final-Report.pdf index ba62943..0a61404 100644 Binary files a/Report/Schrick-Noah_CS-7863_Final-Report.pdf and b/Report/Schrick-Noah_CS-7863_Final-Report.pdf differ diff --git a/Report/Schrick-Noah_CS-7863_Final-Report.tex b/Report/Schrick-Noah_CS-7863_Final-Report.tex index d9a316d..d135728 100644 --- a/Report/Schrick-Noah_CS-7863_Final-Report.tex +++ b/Report/Schrick-Noah_CS-7863_Final-Report.tex @@ -65,9 +65,25 @@ The work conducted in this approach utilized three compliance graphs, with their \subsection{Introduction} The author of \cite{PMID:30064421} provides a survey of centrality measures, and discusses how various centrality measures have been implemented and brought forth in order to determine node importance in networks. By determining the importance of nodes, various conclusions can be drawn regarding the network. In the case of compliance graphs, conclusions can be drawn regarding the prioritization of patching or correction schemes. If one node is known to lead to the creation of many other nodes, it may be said that a patch is imperative to prevent further opportunities for compliance violation. This work discusses five centrality measures, and discusses their application to compliance graphs. \subsection{Degree} -Degree centrality is a trivial, localized measure of node importance based on the number of edges that a node has. In an undirected graph, the degree centrality is predicated solely on the number of edges. However, in the case of a directed graph, a distinction is drawn with a degree centrality oriented on the number of edges coming into a node, and another measure focused on the number of edges leaving a node. Both of these cases provide useful information for compliance graphs. When a node has a large number of other nodes it points to, this node may be prioritized since it creates further room for violation. 
When a node has a large number of edges pointing to it, this node may be prioritized since the probability that systems may enter this state is higher due to the increased number of ways that a system could lead to this state. +Degree centrality is a trivial, localized measure of node importance based on the number of edges that a node has. In an undirected graph, the degree centrality is predicated solely on the number of edges. However, in the case of a directed graph, a distinction is drawn with a degree centrality oriented on the number of edges coming into a node, and another measure focused on the number of edges leaving a node. Both of these cases provide useful information for compliance graphs. When a node has a large number of other nodes it points to, this node may be prioritized since it creates further opportunity for violation. When a node has a large number of edges pointing to it, this node may be prioritized since the probability that systems may enter this state is higher due to the increased number of ways that a system could lead to this state. \subsection{Betweenness} +Betweenness centrality ranks node importance based on its ability to transfer information flow in a network. For all pairs of nodes in a network, a shortest path is determined. A node that is in this shortest path is considered to have importance. The total betweenness centrality is based on the number of shortest paths that pass through a given node. For compliance graphs, the shortest paths are useful to identify the quickest way that systems may fall out of compliance. By prioritizing the nodes that fall in the highest number of shortest paths, correction schemes can be employed to delay or prevent systems from falling out of compliance. 
+ +Betweenness centrality is given in Equation \ref{eq:between}, where \textit{i} and \textit{j} are two different, individual nodes in the network, $\sigma_{ij}$ is the total number of shortest paths from \textit{i} to \textit{j}, and $\sigma _{ij}(v)$ is the number of those shortest paths that pass through node \textit{v}. + +\begin{equation} +C_{\mathrm {B} }(v)=\sum_{i \neq v \neq j} \frac{\sigma_{ij}(v)}{\sigma_{ij}} +\label{eq:between} +\end{equation} + \subsection{Katz} +Katz centrality was first introduced by the author of \cite{Katz}, and measures the importance of nodes through all paths in a network, and is not limited to solely the shortest path between any two given nodes. The original work by the author defines Katz as seen in Equation \ref{eq:Katz}. + +\begin{equation} +C_{\mathrm {Katz} }(i)=\sum _{k=1}^{\infty }\sum _{j=1}^{n}\alpha ^{k}(A^{k})_{ji} +\label{eq:Katz} +\end{equation} + \subsection{K-Path Edge} \subsection{Adapted Page Rank} diff --git a/Report/Schrick-Noah_CS-7863_Final-Report.toc b/Report/Schrick-Noah_CS-7863_Final-Report.toc index a9e4728..dd94fa2 100644 --- a/Report/Schrick-Noah_CS-7863_Final-Report.toc +++ b/Report/Schrick-Noah_CS-7863_Final-Report.toc @@ -8,7 +8,7 @@ \contentsline {section}{\numberline {4}Centralities}{4}{}% \contentsline {subsection}{\numberline {4.1}Introduction}{4}{}% \contentsline {subsection}{\numberline {4.2}Degree}{5}{}% -\contentsline {subsection}{\numberline {4.3}Betweenness}{6}{}% +\contentsline {subsection}{\numberline {4.3}Betweenness}{5}{}% \contentsline {subsection}{\numberline {4.4}Katz}{6}{}% \contentsline {subsection}{\numberline {4.5}K-Path Edge}{6}{}% \contentsline {subsection}{\numberline {4.6}Adapted Page Rank}{6}{}% diff --git a/Schrick-Noah_CG-Analysis.R b/Schrick-Noah_CG-Analysis.R index c4e04fa..dd3a4cd 100644 --- a/Schrick-Noah_CG-Analysis.R +++ b/Schrick-Noah_CG-Analysis.R @@ -45,9 +45,25 @@ base_centralities[[3,1]] <- pci.deg %>% sort(decreasing = T) #### Katz car.katz <- katz.cent(car) +nodes <- car.katz %>% 
order(decreasing=T) +nodes <- head(nodes, 15)-1 +vals <- car.katz %>% sort(decreasing=T) +vals <- head(vals, 15) base_centralities[[1,2]] <- car.katz[rowSums(apply(car.katz,2,is.nan))==0,] %>% sort(decreasing = T) + base_centralities[[2,2]] <- katz.cent(hipaa) %>% sort(decreasing = T) +hipaa.katz <- katz.cent(hipaa) +nodes <- hipaa.katz %>% order(decreasing=T) +nodes <- head(nodes, 15)-1 +vals <- hipaa.katz %>% sort(decreasing=T) +vals <- head(vals, 15) + base_centralities[[3,2]] <- katz.cent(pci) %>% sort(decreasing = T) +pci.katz <- katz.cent(pci) +nodes <- pci.katz %>% order(decreasing=T) +nodes <- head(nodes, 15)-1 +vals <- pci.katz %>% sort(decreasing=T) +vals <- head(vals, 15) ### Page Rank base_centralities[[1,3]] <- page.rank(car)$vector %>% sort(decreasing = T) @@ -159,8 +175,25 @@ tc_centralities[[3,1]] <- pci.tc.deg %>% sort(decreasing = T) #### Katz car.tc.katz <- katz.cent(car.tc) tc_centralities[[1,2]] <- car.tc.katz[rowSums(apply(car.tc.katz,2,is.nan))==0,] %>% sort(decreasing = T) +car.tc.katz <- katz.cent(car.tc) +nodes <- car.tc.katz %>% order(decreasing=T) +nodes <- head(nodes, 15)-1 +vals <- car.tc.katz %>% sort(decreasing=T) +vals <- head(vals, 15) + tc_centralities[[2,2]] <- katz.cent(hipaa.tc) %>% sort(decreasing = T) +hipaa.tc.katz <- katz.cent(hipaa.tc) +nodes <- hipaa.tc.katz %>% order(decreasing=T) +nodes <- head(nodes, 15)-1 +vals <- hipaa.tc.katz %>% sort(decreasing=T) +vals <- head(vals, 15) + tc_centralities[[3,2]] <- katz.cent(pci.tc) %>% sort(decreasing = T) +pci.tc.katz <- katz.cent(pci.tc) +nodes <- pci.tc.katz %>% order(decreasing=T) +nodes <- head(nodes, 15)-1 +vals <- pci.tc.katz %>% sort(decreasing=T) +vals <- head(vals, 15) ### Page Rank tc_centralities[[1,3]] <- page.rank(car.tc)$vector %>% sort(decreasing = T) diff --git a/centralities.R b/centralities.R index 067d14c..a09e90e 100644 --- a/centralities.R +++ b/centralities.R @@ -6,7 +6,10 @@ katz.cent <- function(A, alpha=NULL, beta=NULL){ #NULL sets the default value 
lam.dom <- eigen(A)$values[1] #dom eigenvec if (is.null(alpha)){ - alpha <- 0.9 * (1/lam.dom) #Set alpha to 90% of max allowed + if(lam.dom == 0) + alpha = 0.1 + else + alpha <- 0.9 * (1/lam.dom) #Set alpha to 90% of max allowed if (is.complex(alpha)){ alpha <- Re(alpha) } @@ -19,6 +22,6 @@ katz.cent <- function(A, alpha=NULL, beta=NULL){ #NULL sets the default value #Katz scores scores <- solve(diag(n) - alpha*A,beta) - + return(scores) }