From 3b546f5a765e314811926b91df9a121a760e8f99 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Sun, 11 Nov 2018 20:27:36 -0500
Subject: [PATCH 01/31] brute force effort at grabbing levels
---
 Data Prep/Comparison_functions.R | 117 +++++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100644 Data Prep/Comparison_functions.R

diff --git a/Data Prep/Comparison_functions.R b/Data Prep/Comparison_functions.R
new file mode 100644
index 0000000..ba6c6d0
--- /dev/null
+++ b/Data Prep/Comparison_functions.R
@@ -0,0 +1,117 @@
+##########################################################################################################
+# THREADNET: Batch processing for larger data sets
+
+# (c) 2017 Michigan State University. This software may be used according to the terms provided in the
+# GNU General Public License (GPL-3.0) https://opensource.org/licenses/GPL-3.0?
+# Absolutely no warranty!
+##########################################################################################################
+
+
+library(dplyr)
+
+get_date_subsets <-function(c) {
+b1 <<- new_occ_VR %>% filter(Clinic==eval(c) & as.Date(tStamp) > '2016-01-01' & as.Date(tStamp) < '2016-06-06')
+b2 <<- new_occ_VR %>% filter(Clinic==eval(c) & as.Date(tStamp) > '2016-06-08' & as.Date(tStamp) < '2016-09-01')
+b3 <<- new_occ_VR %>% filter(Clinic==eval(c) & as.Date(tStamp) > '2016-09-02' & as.Date(tStamp) < '2017-04-15')
+b4 <<- new_occ_VR %>% filter(Clinic==eval(c) & as.Date(tStamp) > '2017-04-15' & as.Date(tStamp) < '2017-09-01')
+b5 <<- new_occ_VR %>% filter(Clinic==eval(c) & as.Date(tStamp) > '2017-09-02' & as.Date(tStamp) < '2017-12-31')
+
+rc1 <<- new_occ_VR %>% filter(Clinic=='REDCK' & as.Date(tStamp) > '2016-01-01' & as.Date(tStamp) < '2016-06-06')
+rc2 <<- new_occ_VR %>% filter(Clinic=='REDCK' & as.Date(tStamp) > '2016-06-08' & as.Date(tStamp) < '2016-09-01')
+rc3 <<- new_occ_VR %>% filter(Clinic=='REDCK' & as.Date(tStamp) > '2016-09-02' & as.Date(tStamp) < '2017-04-15')
+rc4 <<- new_occ_VR %>% filter(Clinic=='REDCK' & as.Date(tStamp) > '2017-04-15' & as.Date(tStamp) < '2017-09-01')
+rc5 <<- new_occ_VR %>% filter(Clinic=='REDCK' & as.Date(tStamp) > '2017-09-02' & as.Date(tStamp) < '2017-12-31')
+
+smh1 <<- new_occ_VR %>% filter(Clinic=='SMH' & as.Date(tStamp) > '2016-01-01' & as.Date(tStamp) < '2016-06-06')
+smh2 <<- new_occ_VR %>% filter(Clinic=='SMH' & as.Date(tStamp) > '2016-06-08' & as.Date(tStamp) < '2016-09-01')
+smh3 <<- new_occ_VR %>% filter(Clinic=='SMH' & as.Date(tStamp) > '2016-09-02' & as.Date(tStamp) < '2017-04-15')
+smh4 <<- new_occ_VR %>% filter(Clinic=='SMH' & as.Date(tStamp) > '2017-04-15' & as.Date(tStamp) < '2017-09-01')
+smh5 <<- new_occ_VR %>% filter(Clinic=='SMH' & as.Date(tStamp) > '2017-09-02' & as.Date(tStamp) < '2017-12-31')
+
+hh1 <<- new_occ_VR %>% filter(Clinic=='HHPOB' & as.Date(tStamp) > '2016-01-01' & as.Date(tStamp) < '2016-06-06')
+hh2 <<- new_occ_VR %>% filter(Clinic=='HHPOB' & as.Date(tStamp) > '2016-06-08' & as.Date(tStamp) < '2016-09-01')
+hh3 <<- new_occ_VR %>% filter(Clinic=='HHPOB' & as.Date(tStamp) > '2016-09-02' & as.Date(tStamp) < '2017-04-15')
+hh4 <<- new_occ_VR %>% filter(Clinic=='HHPOB' & as.Date(tStamp) > '2017-04-15' & as.Date(tStamp) < '2017-09-01')
+hh5 <<- new_occ_VR %>% filter(Clinic=='HHPOB' & as.Date(tStamp) > '2017-09-02' & as.Date(tStamp) < '2017-12-31')
+
+}
+
+# c is for the clinic
+# m is the metric
+compare_periods <-function(c,m){
+
+  # b1 <- new_occ_VR %>% filter(Clinic==c & as.Date(tStamp) > '2016-01-01' & as.Date(tStamp) < '2016-06-06')
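+  # (A sketch of how the five hard-coded windows here and in get_date_subsets() could
+  #  be generated from one table of period boundaries; the list layout is an assumption,
+  #  and get_range() is defined at the end of this file:
+  #
+  #   periods <- list(c('2016-01-01','2016-06-06'), c('2016-06-08','2016-09-01'),
+  #                   c('2016-09-02','2017-04-15'), c('2017-04-15','2017-09-01'),
+  #                   c('2017-09-02','2017-12-31'))
+  #   b_list <- lapply(periods, function(p)
+  #     get_range(new_occ_VR %>% filter(Clinic == eval(c)), p[1], p[2]))
+  # )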
+  # b2 <- new_occ_VR %>% filter(Clinic==c & as.Date(tStamp) > '2016-06-08' & as.Date(tStamp) < '2016-09-01')
+  # b3 <- new_occ_VR %>% filter(Clinic==c & as.Date(tStamp) > '2016-09-02' & as.Date(tStamp) < '2017-04-15')
+  # b4 <- new_occ_VR %>% filter(Clinic==c & as.Date(tStamp) > '2017-04-15' & as.Date(tStamp) < '2017-09-01')
+  # b5 <- new_occ_VR %>% filter(Clinic==c & as.Date(tStamp) > '2017-09-02' & as.Date(tStamp) < '2017-12-31')
+
+
+  a1=routine_metric(b1,'Action',m )
+  a2=routine_metric(b2,'Action',m )
+  a3=routine_metric(b3,'Action',m )
+  a4=routine_metric(b4,'Action',m )
+  a5=routine_metric(b5,'Action',m )
+
+  r1=routine_metric(b1,'Role',m )
+  r2=routine_metric(b2,'Role',m )
+  r3=routine_metric(b3,'Role',m )
+  r4=routine_metric(b4,'Role',m )
+  r5=routine_metric(b5,'Role',m )
+
+  ws1=routine_metric(b1,'Workstation',m )
+  ws2=routine_metric(b2,'Workstation',m )
+  ws3=routine_metric(b3,'Workstation',m )
+  ws4=routine_metric(b4,'Workstation',m )
+  ws5=routine_metric(b5,'Workstation',m )
+
+  rc_a1=routine_metric(rc1,'Action',m )
+  rc_a2=routine_metric(rc2,'Action',m )
+  rc_a3=routine_metric(rc3,'Action',m )
+  rc_a4=routine_metric(rc4,'Action',m )
+  rc_a5=routine_metric(rc5,'Action',m )
+
+  smh_a1=routine_metric(smh1,'Action',m )
+  smh_a2=routine_metric(smh2,'Action',m )
+  smh_a3=routine_metric(smh3,'Action',m )
+  smh_a4=routine_metric(smh4,'Action',m )
+  smh_a5=routine_metric(smh5,'Action',m )
+
+  hh_a1=routine_metric(hh1,'Action',m )
+  hh_a2=routine_metric(hh2,'Action',m )
+  hh_a3=routine_metric(hh3,'Action',m )
+  hh_a4=routine_metric(hh4,'Action',m )
+  hh_a5=routine_metric(hh5,'Action',m )
+
+  lex=data.frame(c(a1,a2,a3,a4,a5),
+                 c(hh_a1,hh_a2,hh_a3,hh_a4,hh_a5),
+                 c(rc_a1,rc_a2,rc_a3,rc_a4,rc_a5),
+                 c(smh_a1,smh_a2,smh_a3,smh_a4,smh_a5)
+  )
+
+  ws1=ws1[ws1>0]/nrow(b1)
+  ws2=ws2[ws2>0]/nrow(b2)
+  ws3=ws3[ws3>0]/nrow(b3)
+
+}
+
+
+# o is the list of occurrences
+# m is the metric we want to see
+routine_metric <- function(o, cf, m){
+
+  switch(m,
+         len = nrow(o),
+         lex_size = length(unique(o[[cf]])),
+         first_order = table(o[[cf]]),
+         first_order_sort = sort( table(o[[cf]]), decreasing = T)
+
+  )
+
+}
+
+get_range <- function(o, start_date, end_date){
+
+  o %>% filter(as.Date(tStamp) > start_date & as.Date(tStamp) < end_date)
+
+}

From d5fa0dda420b6af7652de85d7a5aa10b90d213fc Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Thu, 15 Nov 2018 12:51:03 -0500
Subject: [PATCH 02/31] Just trying to learn how this works...
---
 ParallelTest/ptest.R | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 ParallelTest/ptest.R

diff --git a/ParallelTest/ptest.R b/ParallelTest/ptest.R
new file mode 100644
index 0000000..3957931
--- /dev/null
+++ b/ParallelTest/ptest.R
@@ -0,0 +1,26 @@
+##########################################################################################################
+# THREADNET: Batch processing for larger data sets
+
+# (c) 2017 Michigan State University. This software may be used according to the terms provided in the
+# GNU General Public License (GPL-3.0) https://opensource.org/licenses/GPL-3.0?
+# Absolutely no warranty!
+########################################################################################################## + +library(parallel) + +ptest <-function(o) { + x<<-0 + oo<<-o + + l=1:nrow(o) + # return( sapply(l,f1 ) ) + + new_o = cbind(o, sapply(l,f1 ) ) + return(new_o) +} + +f1 <- function(r){ + x<<-x+1 + return(as.character(oo[r,5])) +} + From a7e985ed0b9c4ef3b700b68dfd2cdcac8cc1187f Mon Sep 17 00:00:00 2001 From: Brian Pentland Date: Thu, 15 Nov 2018 15:38:03 -0500 Subject: [PATCH 03/31] First draft... --- Data Prep/ACHR_Batch_PreProcess.R | 541 ++++++++++++++++++++++++++++++ 1 file changed, 541 insertions(+) create mode 100644 Data Prep/ACHR_Batch_PreProcess.R diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R new file mode 100644 index 0000000..b02ee13 --- /dev/null +++ b/Data Prep/ACHR_Batch_PreProcess.R @@ -0,0 +1,541 @@ +########################################################################################################## +# THREADNET: Batch processing for larger data sets +# +# ACHR_Batch_PreProcess +# +# (c) 2018 Michigan State University. This software may be used according to the terms provided in the +# GNU General Public License (GPL-3.0) https://opensource.org/licenses/GPL-3.0? +# Brian Pentland +# Absolutely no warranty! +########################################################################################################## + + +# Separate out the two basicfunctions +# 1) reading/cleaning the occurrences +# 2) Threading the data by adding ThreadNum and SeqNum to the threads + + + +# this function reads the raw data from URMC +# It returns the data frame and also saves it as an Rdata file with the same name as the csv +read_ACHR_data <- function(fname){ + + library(tidyr) + library(data.table) + library(dplyr) + library(ThreadNet) + library(ngram) + library(lubridate) + library(expss) + + + # This code is tailored for reading in new data from URMC, October 2018 + # Assumes there is a column called 'Timestamps" in column 13 and "Visit_ID" + # read the file into data frame + d <- fread( paste0(fname, '.csv') ) + + # Sort by visit and timestamp + d <- arrange(d,desc(Visit_ID,asc(Timestamps))) + + # move and rename the columns, as needed + # Mainly need to get the Timestamps into the first column and rename them tStamp + cn=colnames(d) + cnx = cn[c(13,1:12,14:15)] + d=setcolorder(d,cnx) + setnames(d,'Timestamps','tStamp') + setnames(d,'X','seqn') + + # This converts numbers to char and replaces spaces with underscore + d = cleanOccBatch(d) + + save(d, file=paste0(fname, '.rData')) + + return(d) + +} + +# clean up the raw occurrence data +# Remove blanks for n-gram functionality +cleanOccBatch <- function(fileRows){ + + # extract tStamp + tStamp <- fileRows$tStamp + + # First get rid of any underscores pre-existing in the data, then + # confirm all spaces are converted to underscores in non tStamp columns; set as factors + cleanedCF <- data.frame(lapply(fileRows[2:ncol(fileRows)], function(x){ gsub("_","",x) + gsub(" ","_",x)})) + + # bind tStamp back to cleaned data + complete <- cbind(tStamp,cleanedCF) + + + # Old code for forcing tStamp into a "YMD_HMS" format -- not needed for URMC EMR data + # complete$tStamp <- as.character(complete$tStamp) + # complete$tStamp <- parse_date_time(complete$tStamp, c("dmy HMS", "dmY HMS", "ymd HMS","dmy HM", "dmY HM", "ymd HM")) + + # add weekday and month + # complete$weekday <- as.factor(weekdays(as.Date(complete$tStamp))) + # complete$month <- as.factor(months(as.Date(complete$tStamp))) + + 
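  # (For example, an Action recorded as 'MR VN CHIEF COMPLAINT' would come back from
  #  this cleaning step as 'MR_VN_CHIEF_COMPLAINT' -- the underscore form that later
  #  code, e.g. compute_wait_time() in ACHR_Batch_Aggregate_Data.R, greps for. The raw
  #  spelling with spaces is an illustrative assumption.)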
return(complete) +} + + +# This function adds columns for the the thread, if necessary +# Then it sorts by thread and tStamp and adds thread/sequence numbers to each thread. +# It returns the threaded set of occurrences and also saves it as Rdata. +thread_occurrences <- function(o,thread_CF,fname='emr'){ + + # call the functions that clean up the occurrences + # Column names are hard coded from the UMRC file + # EVENT_CF = c('Action','Workstation','Role','Clinic','Diagnosis_Group') + EVENT_CF = c('Action','Role','Workstation') + TN = 'Visit_ID' + + + + ### Code for Visit_Role ### + + # need to add Visit_Role column and use it to create threads. Each Visit_Role needs a unique threadNum and seqNum + # This is likely to be tricky + new_occ_VR = unite(new_occ, 'Visit_Role', c('Visit_ID','Role'),sep='_',remove = 'false') + + # add two columns to the data frame + new_occ_VR$threadNum = integer(nrow(new_occ_VR)) + new_occ_VR$seqNum = integer(nrow(new_occ_VR)) + new_occ_VR = new_occ_VR[order(new_occ_VR$Visit_Role,new_occ_VR$tStamp),] + tn<<-0 + + # new_occ_VR = new_occ_VR[1:1000,] + + pov_list = unique(new_occ_VR$Visit_Role) + + + start_row=1 + thrd=1 + for (p in pov_list){ + + # get the length of the thread + tlen = sum(new_occ_VR$Visit_Role==p) + + # guard against error + if (length(tlen)==0) tlen=0 + if (tlen>0){ + + #compute the index of the end row + end_row = start_row+tlen-1 + + # they all get the same thread number and incrementing seqNum + new_occ_VR[start_row:end_row, "threadNum"] <- as.matrix(rep(as.integer(thrd),tlen)) + new_occ_VR[start_row:end_row, "seqNum"] <- seq(tlen) + + # increment the counters for the next thread + start_row = end_row + 1 + thrd=thrd+1 + } + + } + + # # split occ data frame by threadNum to find earliest time value for that thread + # # then substract that from initiated relativeTime from above + # occ_split = lapply(split(occ, occ$threadNum), + # function(x) {x$relativeTime = difftime(x$relativeTime, min(lubridate::ymd_hms(x$tStamp)), units=timescale ); x}) + # + # # # row bind data frame back together + # occ= data.frame(do.call(rbind, occ_split)) + + save(new_occ_VR, file='auditfinal_VR_10292018.rData') + + + + # ThreadNet code assumes these columns will be there, so we need to add them + # do not use these for visit_role + # occ$threadNum = occ$Visit_ID + # occ$seqNum = occ$seqn + + + # because the occurrences are already sorted, and they have sequence numbers, we should be good to go. + # new_occ = combineContextFactors(occ,EVENT_CF,newColName(EVENT_CF))[1:100000,] + new_occ = combineContextFactors(occ,EVENT_CF,newColName(EVENT_CF)) + new_occ = combineContextFactors(new_occ,EVENT_CF[1:2],newColName(EVENT_CF[1:2])) + + # get the date only from the timestamp + new_occ$ymd <- format(as.POSIXct(new_occ$tStamp),"%Y-%m-%d") + + # Rename one of the clinics so it doesn't have an underscore. 
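  # (The underscore matters because '_' is the separator used when columns are pasted
  #  together -- e.g. the unite(..., sep='_') calls below -- so 'HH_POB' would split
  #  back into three tokens instead of two when a combined column is parsed apart.)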
+ new_occ$Clinic = gsub('HH_POB','HHPOB',new_occ$Clinic) + + # make new columns for clinic + day and events + new_occ = unite(new_occ, 'Clinic_ymd', c('Clinic','ymd'),sep='_',remove = 'false') + + + + # save this intermediate result for later use + save(new_occ, file='auditfinal_10022018.rData') + + + # RUN THESE TO BOIL/SAVE THE COMPLEXITY DATA + clinic_days = ACHR_batch_clinic_days(new_occ, TN, EVENT_CF) + save(clinic_days, file='ACHRV3_clinic_days.rData') + + visits = ACHR_batch_visits(new_occ, TN, EVENT_CF) + save(visits, file='ACHRV3_visits_with_timestamps.rData') + +} + + + + +################################################################### +################################################################### +ACHR_batch_clinic_days <- function(occ,TN, CFs) { + + # Name for column that has events -- three variations + DV1= newColName(CFs[1]) + DV2= newColName(CFs[1:2]) + DV3= newColName(CFs[1:3]) + + print(DV1) + print(DV2) + print(DV3) + + # name for column that defines the buckets + BU = 'Clinic_ymd' + + # first get the date only +# occ$ymd <- format(as.POSIXct(occ$tStamp),"%Y-%m-%d") + + # make new columns as needed for clinic + day and events +# occ = unite(occ, 'Clinic_ymd', c('Clinic','ymd'),sep='_',remove = 'false') + + + + # pick subsets -- one visit at a time in this version, but could be more + bucket_list <- unique(occ[['Clinic_ymd']]) + + # get the size (number of buckets) + N = length(bucket_list) + print(N) + + # pre-allocate the data.table. Tables are supposed to be faster. + ACHR = data.table(bucket=integer(N), + Clinic_date = character(N), + YMD_date = character(N), + NEvents = integer(N), + NumVisits = numeric(N), + A_NetComplexity=double(N), + AR_NetComplexity=double(N), + ARW_NetComplexity=double(N), + A_Nodes=double(N), + AR_Nodes=double(N), + ARW_Nodes=double(N), + A_Edges=double(N), + AR_Edges=double(N), + ARW_Edges=double(N), + A_CompressRatio = double(N), + AR_CompressRatio = double(N), + ARW_CompressRatio = double(N), + A_Entropy = double(N), + AR_Entropy = double(N), + ARW_Entropy = double(N), + Clinic = character(N), + NumUniqueProcedures = numeric(N), + NumUniqueDiagnosisGroups = numeric(N), + NumPhysicians = numeric(N), + Weekday = character(N), + Month = character(N) + ) + + # Now add columns for the IVs. There will be three for each IV + + # Add the IV columns + for (cf in CFs){ + + ACHR[, paste0(cf,"_count"):= double(N)] + ACHR[, paste0(cf,"_compression"):= double(N)] + ACHR[, paste0(cf,"_entropy"):= double(N)] + + } + + # loop through the buckets. 
Result will be data frame with one row per bucket + for (i in 1:N){ + + b = i # as.integer(bucket_list[i]) + + # print once every 10 buckets + if (b%%10==0) {print(b)} + + # select the threads that go in this bucket + df = occ[occ[[BU]] ==bucket_list[i],] + + # bucket number + ACHR[b,bucket := b] + + # length of the thread (number of rows) + ACHR[b,NEvents := nrow(df)] + + # only do the computations if there are more than two occurrences + if (nrow(df) > 2) { + + # compute the duration of the visit in hours + # ACHR[b,VisitDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours') ] + + # compressibility of DV + ACHR[b,A_CompressRatio := compression_index(df,DV1)] + ACHR[b,AR_CompressRatio := compression_index(df,DV2)] + ACHR[b,ARW_CompressRatio := compression_index(df,DV3)] + + # NetComplexity of DV + # First get the network + n = threads_to_network_original(df,TN, DV1) + ACHR[b,A_NetComplexity := estimate_network_complexity( n )] + ACHR[b,A_Nodes := nrow(n$nodeDF) ] + ACHR[b,A_Edges := nrow(n$edgeDF) ] + + n = threads_to_network_original(df,TN, DV2) + ACHR[b,AR_NetComplexity := estimate_network_complexity( n )] + ACHR[b,AR_Nodes := nrow(n$nodeDF) ] + ACHR[b,AR_Edges := nrow(n$edgeDF) ] + + n = threads_to_network_original(df,TN, DV3) + ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )] + ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] + ACHR[b,ARW_Edges := nrow(n$edgeDF) ] + + # get the entropy for AR and ARW + ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] + ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] + + + # compute stuff on each context factor + for (cf in CFs){ + + # Count the unique elements in each cf + ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] + + # get the compression + ACHR[b, paste0(cf,"_compression") := compression_index(df,cf) ] + + # get the entropy + ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] + + } + } # kf nrows > 2 + + # Now copy in the rest of data + # this works because one visit is one bucket + # count the number of different diagnoses --> typical indicators of complexity + + + ACHR[b,'Clinic_date' := df[1,'Clinic_ymd']] + ACHR[b,'YMD_date' := df[1,'ymd']] + ACHR[b,'NumVisits' := length(unique(df[['Visit_ID']]))] + ACHR[b,'Clinic' := df[1,'Clinic']] + ACHR[b,'NumUniqueProcedures' := length(unique(df[['Proc']]))] + ACHR[b,'NumUniqueDiagnosisGroups' := length(unique(df[['Diagnosis_Group']]))] + ACHR[b,'NumPhysicians' := length(unique(df[['Physician']]))] + ACHR[b,'Weekday' := df[1,'Weekday']] + ACHR[b,'Month' := df[1,'Month']] + + + + } # loop thru buckets + + # copy this one for consistency + ACHR[,'A_Entropy' := ACHR[,'Action_entropy'] ] + + # return the table + return(ACHR) +} + +################################################################### +################################################################### + + + +# ACHR stands for Antecedents of Complexity in Healthcare Routines. +# This is function is set up to compute process parameters on thousands of patient visits. +ACHR_batch_visits <- function(occ,TN, CFs) { + + DV1= newColName(CFs[1]) + DV2= newColName(CFs[1:2]) + DV3= newColName(CFs[1:3]) + + print(DV1) + print(DV2) + print(DV3) + +# pick subsets -- one visit at a time in this version, but could be more +bucket_list <- make_buckets_1(occ, 'Visit_ID') + +# get the size (number of buckets) +N = length(bucket_list) +print(bucket_list) + + +# pre-allocate the data.table. 
Tables are supposed to be faster. +ACHR = data.table(bucket=integer(N), + NEvents = integer(N), + VisitStart= character(N), + VisitStartInt = integer(N), + VisitDuration=double(N), + A_NetComplexity=double(N), + AR_NetComplexity=double(N), + ARW_NetComplexity=double(N), + A_Nodes=double(N), + AR_Nodes=double(N), + ARW_Nodes=double(N), + A_Edges=double(N), + AR_Edges=double(N), + ARW_Edges=double(N), + A_CompressRatio = double(N), + AR_CompressRatio = double(N), + ARW_CompressRatio = double(N), + A_Entropy = double(N), + AR_Entropy = double(N), + ARW_Entropy = double(N), + Visit_ID = character(N), + Subject_ID = character(N), + Clinic = character(N), + Procedure = character(N), + Diagnosis = character(N), # diag in the raw data + Diagnosis_group = character(N), + Physician = character(N), + Weekday = character(N), + Month = character(N) + ) + +# Now add columns for the IVs. There will be three for each IV + +# Add the IV columns +for (cf in CFs){ + + ACHR[, paste0(cf,"_count"):= double(N)] + ACHR[, paste0(cf,"_compression"):= double(N)] + ACHR[, paste0(cf,"_entropy"):= double(N)] + +} + +# loop through the buckets. Result will be data frame with one row per bucket +for (i in 1:N){ + + b = i # as.integer(bucket_list[i]) + + # print once every 100 visits + if (b%%100==0) {print(b)} + + # select the threads that go in this bucket + df = occ[occ[[TN]] ==bucket_list[i],] + + # bucket number + ACHR[b,bucket := b] + + # length of the thread (number of rows) + ACHR[b,NEvents := nrow(df)] + + # only do the computations if there are more than two occurrences + if (nrow(df) > 2) { + + # compute the duration of the visit in hours + ACHR[b,VisitDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours') ] + + # compressibility of DV + ACHR[b,A_CompressRatio := compression_index(df,DV1)] + ACHR[b,AR_CompressRatio := compression_index(df,DV2)] + ACHR[b,ARW_CompressRatio := compression_index(df,DV3)] + + # NetComplexity of DV + # First get the network + # NetComplexity of DV + # First get the network + n = threads_to_network_original(df,TN, DV1) + ACHR[b,A_NetComplexity := estimate_network_complexity( n )] + ACHR[b,A_Nodes := nrow(n$nodeDF) ] + ACHR[b,A_Edges := nrow(n$edgeDF) ] + + n = threads_to_network_original(df,TN, DV2) + ACHR[b,AR_NetComplexity := estimate_network_complexity( n )] + ACHR[b,AR_Nodes := nrow(n$nodeDF) ] + ACHR[b,AR_Edges := nrow(n$edgeDF) ] + + n = threads_to_network_original(df,TN, DV3) + ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )] + ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] + ACHR[b,ARW_Edges := nrow(n$edgeDF) ] + + # get the entropy for AR and ARW + ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] + ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] + + + # compute stuff on each context factor + for (cf in CFs){ + + # Count the unique elements in each cf + ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] + + # get the compression + ACHR[b, paste0(cf,"_compression") := compression_index(df,cf) ] + + # get the entropy + ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] + + } +} # kf nrows > 2 + + # Now copy in the rest of data + # this only works because one visit is one bucket + + # ACHR[b,'VisitStart' := as.POSIXct( df[1,'tStamp']) ] + ACHR[b,'VisitStartInt' := df[1,'tStamp'] ] + + ACHR[b,'VisitStart' := as.character( df[1,'tStamp'] ) ] + + ACHR[b,'Visit_ID' := df[1,'Visit_ID']] + ACHR[b,'Subject_ID' := 
df[1,'Subject_ID']] + ACHR[b,'Clinic' := df[1,'Clinic']] + ACHR[b,'Procedure' := df[1,'Proc']] + ACHR[b,'Diagnosis' := df[1,'Diag']] + ACHR[b,'Diagnosis_group' := df[1,'Diagnosis_Group']] + ACHR[b,'Physician' := df[1,'Physician']] + ACHR[b,'Weekday' := df[1,'Weekday']] + ACHR[b,'Month' := df[1,'Month']] + + + +} # loop thru buckets + +# copy this one for consistency +ACHR[,'A_Entropy' := ACHR[,'Action_entropy'] ] + +# return the table +return(ACHR) +} + + +# Each bucket is a list of thread numbers that can be used to subset the list of occurrences +make_buckets_1 <- function(o, criteria){ + + return( unique(o[[criteria]]) ) + +} + + + + + +# +# make_box_plots <- function(){ +# ggboxplot(ACHR_test[NEvents>100 & Clinic=='DRH'], x = "VisitMonth", y = "NetComplexity", +# color = "VisitDay", +# ylab = "Complexity", xlab = "Month (DRH)") +# } +# + + + + + +get_timeScale <- function(){'hr'} + From 6f18e3c3328982b2cb8f32c312e42ef0b2c41df5 Mon Sep 17 00:00:00 2001 From: Brian Pentland Date: Thu, 15 Nov 2018 16:03:41 -0500 Subject: [PATCH 04/31] Making progress... --- Data Prep/ACHR_Batch_PreProcess.R | 59 ++++++++++++++++--------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R index b02ee13..a721d94 100644 --- a/Data Prep/ACHR_Batch_PreProcess.R +++ b/Data Prep/ACHR_Batch_PreProcess.R @@ -81,36 +81,45 @@ cleanOccBatch <- function(fileRows){ return(complete) } - -# This function adds columns for the the thread, if necessary +######################################################################################### +# This function adds columns for the the thread, as requested # Then it sorts by thread and tStamp and adds thread/sequence numbers to each thread. +# +# Typical column names from the UMRC file +# ALL_CF = c('Action','Workstation','Role','Clinic','Diagnosis_Group') +# thread_CF = c('Visit_ID', 'Role', 'Workstation') +# event_CF = c('Action','Role','Workstation') +# TN = 'Visit_ID' # It returns the threaded set of occurrences and also saves it as Rdata. -thread_occurrences <- function(o,thread_CF,fname='emr'){ +thread_occurrences <- function(occ,THREAD_CF,EVENT_CF, fname='emr'){ - # call the functions that clean up the occurrences - # Column names are hard coded from the UMRC file - # EVENT_CF = c('Action','Workstation','Role','Clinic','Diagnosis_Group') - EVENT_CF = c('Action','Role','Workstation') - TN = 'Visit_ID' - - + # these will be new column names + new_thread_name = newColName(THREAD_CF) + new_event_name = newColName(EVENT_CF) - ### Code for Visit_Role ### + # Add names for the new columns, if necessary + if (!(new_thread_name %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CF,new_thread_name) } + if (!(new_event_name %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CF,new_event_name) } - # need to add Visit_Role column and use it to create threads. 
Each Visit_Role needs a unique threadNum and seqNum - # This is likely to be tricky - new_occ_VR = unite(new_occ, 'Visit_Role', c('Visit_ID','Role'),sep='_',remove = 'false') - # add two columns to the data frame + # ThreadNet code assumes these columns will be there, so we need to add them new_occ_VR$threadNum = integer(nrow(new_occ_VR)) new_occ_VR$seqNum = integer(nrow(new_occ_VR)) - new_occ_VR = new_occ_VR[order(new_occ_VR$Visit_Role,new_occ_VR$tStamp),] - tn<<-0 - # new_occ_VR = new_occ_VR[1:1000,] + # Special case for Visit_ID, which is already filled in URMC EMR data + if (THREAD_CF == 'VISIT_ID') { + occ$threadNum = occ$Visit_ID + occ$seqNum = occ$seqn + } + + # Now sort the data set by the new threadNum and tStamp + occ = occ[order(occ[[new_thread_name]],occ$tStamp),] - pov_list = unique(new_occ_VR$Visit_Role) - + tn<<-0 + + # get the list of unique identifies for the threads. The length of this list is the number of threads + pov_list = unique(occ[[new_thread_name]]) + print(paste('Number of threads in this POV: ', length(pov_list))) start_row=1 thrd=1 @@ -148,17 +157,9 @@ thread_occurrences <- function(o,thread_CF,fname='emr'){ save(new_occ_VR, file='auditfinal_VR_10292018.rData') - - # ThreadNet code assumes these columns will be there, so we need to add them - # do not use these for visit_role - # occ$threadNum = occ$Visit_ID - # occ$seqNum = occ$seqn + - # because the occurrences are already sorted, and they have sequence numbers, we should be good to go. - # new_occ = combineContextFactors(occ,EVENT_CF,newColName(EVENT_CF))[1:100000,] - new_occ = combineContextFactors(occ,EVENT_CF,newColName(EVENT_CF)) - new_occ = combineContextFactors(new_occ,EVENT_CF[1:2],newColName(EVENT_CF[1:2])) # get the date only from the timestamp new_occ$ymd <- format(as.POSIXct(new_occ$tStamp),"%Y-%m-%d") From 92f6d47fba49b00871a0cd0bb4f2a55be6638e76 Mon Sep 17 00:00:00 2001 From: Brian Pentland Date: Thu, 15 Nov 2018 17:39:42 -0500 Subject: [PATCH 05/31] Making progress... --- Data Prep/ACHR_Batch_PreProcess.R | 104 ++++++++++++++++++------------ 1 file changed, 62 insertions(+), 42 deletions(-) diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R index a721d94..4f99de3 100644 --- a/Data Prep/ACHR_Batch_PreProcess.R +++ b/Data Prep/ACHR_Batch_PreProcess.R @@ -14,7 +14,25 @@ # 1) reading/cleaning the occurrences # 2) Threading the data by adding ThreadNum and SeqNum to the threads +redo_ACHR_data_from_scratch <- function(fname){ + # Pick you point of view, or multiple POV... + THREAD_CF = c('VISIT_ID') + EVENT_CF = c('Action','Role','Workstation') + + # first read the data + o = read_ACHR_data( fname ) + + # Thread occurrences for each POV + occ = thread_occurrences( o, THREAD_CF, EVENT_CF ,fname ) + + # Now aggregate by clinic-day for each POV + clinic_days = ACHR_batch_clinic_days(occ, THREAD_CF, EVENT_CF, fname) + + # Aggregate by visit for each POV + visits = ACHR_batch_visits(occ, THREAD_CF, EVENT_CF, fname) + +} # this function reads the raw data from URMC # It returns the data frame and also saves it as an Rdata file with the same name as the csv @@ -93,14 +111,18 @@ cleanOccBatch <- function(fileRows){ # It returns the threaded set of occurrences and also saves it as Rdata. 
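A hypothetical call, to make the revised interface concrete (the column names follow
the "Typical column names" comment above; the fname is illustrative):

   occ <- thread_occurrences(occ,
                             THREAD_CF = c('Visit_ID', 'Role'),
                             EVENT_CF  = c('Action', 'Role', 'Workstation'),
                             fname     = 'emr')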
thread_occurrences <- function(occ,THREAD_CF,EVENT_CF, fname='emr'){ + print(paste('Number of occurrences: ', nrow(occ))) + # these will be new column names - new_thread_name = newColName(THREAD_CF) - new_event_name = newColName(EVENT_CF) + new_thread_col = newColName(THREAD_CF) + new_event_col = newColName(EVENT_CF) - # Add names for the new columns, if necessary - if (!(new_thread_name %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CF,new_thread_name) } - if (!(new_event_name %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CF,new_event_name) } + print(paste('new_thread_col: ',new_thread_col)) + print(paste('new_event_col: ', new_event_col)) + # Add names for the new columns, if necessary + if (!(new_thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CF,new_thread_col) } + if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CF,new_event_col) } # ThreadNet code assumes these columns will be there, so we need to add them new_occ_VR$threadNum = integer(nrow(new_occ_VR)) @@ -110,15 +132,21 @@ thread_occurrences <- function(occ,THREAD_CF,EVENT_CF, fname='emr'){ if (THREAD_CF == 'VISIT_ID') { occ$threadNum = occ$Visit_ID occ$seqNum = occ$seqn - } + + # say how many there are... + print(paste('Number of threads (visits) in this POV: ', length(unique( occ$Visit_ID )))) + + # Now sort the data set by the new threadNum and tStamp + occ = occ[order(occ[[new_thread_col]],occ$tStamp),] + + } + else { # Now sort the data set by the new threadNum and tStamp - occ = occ[order(occ[[new_thread_name]],occ$tStamp),] - - tn<<-0 + occ = occ[order(occ[[new_thread_col]],occ$tStamp),] # get the list of unique identifies for the threads. The length of this list is the number of threads - pov_list = unique(occ[[new_thread_name]]) + pov_list = unique(occ[[new_thread_col]]) print(paste('Number of threads in this POV: ', length(pov_list))) start_row=1 @@ -126,7 +154,7 @@ thread_occurrences <- function(occ,THREAD_CF,EVENT_CF, fname='emr'){ for (p in pov_list){ # get the length of the thread - tlen = sum(new_occ_VR$Visit_Role==p) + tlen = sum(occ[[new_thread_col]]==p) # guard against error if (length(tlen)==0) tlen=0 @@ -136,16 +164,36 @@ thread_occurrences <- function(occ,THREAD_CF,EVENT_CF, fname='emr'){ end_row = start_row+tlen-1 # they all get the same thread number and incrementing seqNum - new_occ_VR[start_row:end_row, "threadNum"] <- as.matrix(rep(as.integer(thrd),tlen)) - new_occ_VR[start_row:end_row, "seqNum"] <- seq(tlen) + occ[start_row:end_row, "threadNum"] <- as.matrix(rep(as.integer(thrd),tlen)) + occ[start_row:end_row, "seqNum"] <- seq(tlen) # increment the counters for the next thread start_row = end_row + 1 thrd=thrd+1 + + if (thrd %% 1000 ==0) print(paste0('Thread count = ',thrd)) + } - } + } + + ### This is URMC specific: For all POV combinations, add the clinic and clinic_day + + # get the date only from the timestamp + occ$ymd <- format(as.POSIXct(new_occ$tStamp),"%Y-%m-%d") + + # make new columns for clinic + day and events + occ = unite(occ, 'Clinic_ymd', c('Clinic','ymd'),sep='_',remove = 'false') + + + # Now save the result for later and return it, as well. 
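A note on the '+'-separated file name built below: with fname = 'emrt', THREAD_CF =
'Visit_ID' and EVENT_CF = c('Role','Action'), it would come out as
'emrt+Visit_ID+Role_Action.Rdata' -- the pattern of the .gitignore entries added in
the commits that follow (assuming newColName() joins its inputs with '_').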
+  save(occ, file=paste0(paste(fname,new_thread_col,new_event_col,sep='+'), '.Rdata'))
+  print(paste('Saved threaded occurrences: ', nrow(occ)))
+
+  return(occ)
+
+}
+
+  ############ extra stuff not used #############
   # # split occ data frame by threadNum to find earliest time value for that thread
   # # then substract that from initiated relativeTime from above
   # occ_split = lapply(split(occ, occ$threadNum),
   #                    function(x) {x$relativeTime = difftime(x$relativeTime, min(lubridate::ymd_hms(x$tStamp)), units=timescale ); x})
   #
   # # # row bind data frame back together
   # occ= data.frame(do.call(rbind, occ_split))

From da487911dac226b84120a225f7258831b99adc48 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Sun, 18 Nov 2018 22:05:02 -0500
Subject: [PATCH 06/31] Making progress
---
 .gitignore                        |   1 +
 Data Prep/.gitignore              |   1 +
 Data Prep/ACHR_Batch_PreProcess.R | 448 ++----------
 Data Prep/ACHR_Batch_V2.R         |   8 +-
 4 files changed, 52 insertions(+), 406 deletions(-)
 create mode 100644 Data Prep/.gitignore

diff --git a/.gitignore b/.gitignore
index dd0b92d..62fdb39 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,4 @@ vignettes/*.pdf
 rsconnect/
 >>>>>>> be0331537859f5addf650269855aa0a9d49ca5d5
 .Rproj.user
+emrt+Visit_ID+Role_Action.Rdata
diff --git a/Data Prep/.gitignore b/Data Prep/.gitignore
new file mode 100644
index 0000000..8ccf702
--- /dev/null
+++ b/Data Prep/.gitignore
@@ -0,0 +1 @@
+ACHR_Batch_Aggregate_Data.R
diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R
index 4f99de3..ef43e23 100644
--- a/Data Prep/ACHR_Batch_PreProcess.R
+++ b/Data Prep/ACHR_Batch_PreProcess.R
@@ -7,12 +7,13 @@
 # GNU General Public License (GPL-3.0) https://opensource.org/licenses/GPL-3.0?
 # Brian Pentland
 # Absolutely no warranty!
-##########################################################################################################
-
-
-# Separate out the two basicfunctions
+#
+# Separate out the two basic functions
 # 1) reading/cleaning the occurrences
 # 2) Threading the data by adding ThreadNum and SeqNum to the threads
+#
+##########################################################################################################
+

+redo_ACHR_data_from_scratch <- function(fname){
+
+  # Pick your point of view, or multiple POV...
+  THREAD_CF = c('Visit_ID')
+  EVENT_CF = c('Action','Role','Workstation')
+
+  # first read the data
+  o = read_ACHR_data( fname )
+
+  # Thread occurrences for each POV
+  occ = thread_occurrences( o, THREAD_CF, fname )
+
+  # Now aggregate by clinic-day for each POV
+  clinic_days = ACHR_batch_clinic_days(occ, THREAD_CF, EVENT_CF, fname)
+
+  # Aggregate by visit for each POV
+  visits = ACHR_batch_visits(occ, THREAD_CF, EVENT_CF, fname)
+
+}

 # this function reads the raw data from URMC
 # It returns the data frame and also saves it as an Rdata file with the same name as the csv
+# Tested Nov 16.
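A hypothetical call (the csv base name comes from the fread() call in ACHR_Batch_V2.R):

   d <- read_ACHR_data('auditfinal_10022018')   # reads auditfinal_10022018.csv, saves auditfinal_10022018.rData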
read_ACHR_data <- function(fname){ library(tidyr) @@ -44,29 +46,44 @@ read_ACHR_data <- function(fname){ library(ThreadNet) library(ngram) library(lubridate) - library(expss) + library(anytime) # This code is tailored for reading in new data from URMC, October 2018 # Assumes there is a column called 'Timestamps" in column 13 and "Visit_ID" # read the file into data frame - d <- fread( paste0(fname, '.csv') ) + d <- fread( paste0(fname, '.csv') ) # Sort by visit and timestamp - d <- arrange(d,desc(Visit_ID,asc(Timestamps))) + d <- arrange(d,desc(Visit_ID,asc(Timestamps))) + ################################################################################## + # This stuff is URMC specific + # # move and rename the columns, as needed # Mainly need to get the Timestamps into the first column and rename them tStamp cn=colnames(d) cnx = cn[c(13,1:12,14:15)] d=setcolorder(d,cnx) setnames(d,'Timestamps','tStamp') - setnames(d,'X','seqn') - + setnames(d,'V1','seqn') + # This converts numbers to char and replaces spaces with underscore d = cleanOccBatch(d) - save(d, file=paste0(fname, '.rData')) + ### This is URMC specific: Add the clinic and clinic_day and fix the code for the highland clinic + + # get the date only from the timestamp + d$ymd <- format( as.POSIXct(d$tStamp, format="%Y-%m-%d"), "%Y-%m-%d" ) + + # make new columns for clinic + day and events + d = unite(d, 'Clinic_ymd', c('Clinic','ymd'),sep='_',remove = 'false') + + # Fix the name of the Highland clinic + d$Clinic = gsub('HH_POB','HHPOB',d$Clinic) + + # Save the result + save(d, file=paste0(fname, '.rData')) return(d) @@ -81,17 +98,15 @@ cleanOccBatch <- function(fileRows){ # First get rid of any underscores pre-existing in the data, then # confirm all spaces are converted to underscores in non tStamp columns; set as factors - cleanedCF <- data.frame(lapply(fileRows[2:ncol(fileRows)], function(x){ gsub("_","",x) - gsub(" ","_",x)})) + cleanedCF <- data.frame(lapply(fileRows[2:ncol(fileRows)], function(x){ gsub(" ","_",x) }) ) # bind tStamp back to cleaned data complete <- cbind(tStamp,cleanedCF) - # Old code for forcing tStamp into a "YMD_HMS" format -- not needed for URMC EMR data - # complete$tStamp <- as.character(complete$tStamp) - # complete$tStamp <- parse_date_time(complete$tStamp, c("dmy HMS", "dmY HMS", "ymd HMS","dmy HM", "dmY HM", "ymd HM")) - + # complete$tStamp <- as.character(complete$tStamp) + # complete$tStamp <- parse_date_time(complete$tStamp, c("dmy HMS", "dmY HMS", "ymd HMS","dmy HM", "dmY HM", "ymd HM")) + # add weekday and month # complete$weekday <- as.factor(weekdays(as.Date(complete$tStamp))) # complete$month <- as.factor(months(as.Date(complete$tStamp))) @@ -100,47 +115,34 @@ cleanOccBatch <- function(fileRows){ } ######################################################################################### -# This function adds columns for the the thread, as requested +# This function adds columns for the the thread, as requested ( Tested Nov 16 ) # Then it sorts by thread and tStamp and adds thread/sequence numbers to each thread. # # Typical column names from the UMRC file # ALL_CF = c('Action','Workstation','Role','Clinic','Diagnosis_Group') # thread_CF = c('Visit_ID', 'Role', 'Workstation') # event_CF = c('Action','Role','Workstation') -# TN = 'Visit_ID' + # It returns the threaded set of occurrences and also saves it as Rdata. 
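A small illustration of what the threading below adds, assuming newColName() joins
the THREAD_CF columns with '_' (the rows and values are made up):

   # tStamp               Visit_ID  Role  Visit_ID_Role  threadNum  seqNum
   # 2016-03-01 08:01:00  V001      RN    V001_RN        1          1
   # 2016-03-01 08:05:00  V001      RN    V001_RN        1          2
   # 2016-03-01 08:03:00  V001      MD    V001_MD        2          1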
-thread_occurrences <- function(occ,THREAD_CF,EVENT_CF, fname='emr'){ +thread_occurrences <- function(occ, THREAD_CF, fname='emr'){ print(paste('Number of occurrences: ', nrow(occ))) # these will be new column names new_thread_col = newColName(THREAD_CF) - new_event_col = newColName(EVENT_CF) + # new_event_col = newColName(EVENT_CF) print(paste('new_thread_col: ',new_thread_col)) - print(paste('new_event_col: ', new_event_col)) + # print(paste('new_event_col: ', new_event_col)) # Add names for the new columns, if necessary if (!(new_thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CF,new_thread_col) } - if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CF,new_event_col) } +# if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CF,new_event_col) } # ThreadNet code assumes these columns will be there, so we need to add them - new_occ_VR$threadNum = integer(nrow(new_occ_VR)) - new_occ_VR$seqNum = integer(nrow(new_occ_VR)) + occ$threadNum = integer(nrow(occ)) + occ$seqNum = integer(nrow(occ)) - # Special case for Visit_ID, which is already filled in URMC EMR data - if (THREAD_CF == 'VISIT_ID') { - occ$threadNum = occ$Visit_ID - occ$seqNum = occ$seqn - - # say how many there are... - print(paste('Number of threads (visits) in this POV: ', length(unique( occ$Visit_ID )))) - - # Now sort the data set by the new threadNum and tStamp - occ = occ[order(occ[[new_thread_col]],occ$tStamp),] - - } - else { # Now sort the data set by the new threadNum and tStamp occ = occ[order(occ[[new_thread_col]],occ$tStamp),] @@ -175,23 +177,16 @@ thread_occurrences <- function(occ,THREAD_CF,EVENT_CF, fname='emr'){ } } - } - - ### This is URMC specific: For all POV combinations, add the clinic and clinic_day - - # get the date only from the timestamp - occ$ymd <- format(as.POSIXct(new_occ$tStamp),"%Y-%m-%d") - - # make new columns for clinic + day and events - occ = unite(occ, 'Clinic_ymd', c('Clinic','ymd'),sep='_',remove = 'false') - # Now save the result for later and return it, as well. 
- save(occ, file=paste0(paste(fname,new_thread_col,new_event_col,sep='+'), '.Rdata')) +# save(occ, file=paste0(paste(fname,new_thread_col,new_event_col,sep='+'), '.Rdata')) + save(occ, file=paste0(paste(fname,new_thread_col,sep='+'), '.Rdata')) + print(paste('Saved threaded occurrences: ', nrow(occ))) return(occ) - +} + ############ extra stuff not used ############# # # split occ data frame by threadNum to find earliest time value for that thread @@ -203,360 +198,7 @@ thread_occurrences <- function(occ,THREAD_CF,EVENT_CF, fname='emr'){ # occ= data.frame(do.call(rbind, occ_split)) -} - - - - -################################################################### -################################################################### -ACHR_batch_clinic_days <- function(occ,TN, CFs) { - - # Name for column that has events -- three variations - DV1= newColName(CFs[1]) - DV2= newColName(CFs[1:2]) - DV3= newColName(CFs[1:3]) - - print(DV1) - print(DV2) - print(DV3) - - # name for column that defines the buckets - BU = 'Clinic_ymd' - - # first get the date only -# occ$ymd <- format(as.POSIXct(occ$tStamp),"%Y-%m-%d") - - # make new columns as needed for clinic + day and events -# occ = unite(occ, 'Clinic_ymd', c('Clinic','ymd'),sep='_',remove = 'false') - - - - # pick subsets -- one visit at a time in this version, but could be more - bucket_list <- unique(occ[['Clinic_ymd']]) - - # get the size (number of buckets) - N = length(bucket_list) - print(N) - - # pre-allocate the data.table. Tables are supposed to be faster. - ACHR = data.table(bucket=integer(N), - Clinic_date = character(N), - YMD_date = character(N), - NEvents = integer(N), - NumVisits = numeric(N), - A_NetComplexity=double(N), - AR_NetComplexity=double(N), - ARW_NetComplexity=double(N), - A_Nodes=double(N), - AR_Nodes=double(N), - ARW_Nodes=double(N), - A_Edges=double(N), - AR_Edges=double(N), - ARW_Edges=double(N), - A_CompressRatio = double(N), - AR_CompressRatio = double(N), - ARW_CompressRatio = double(N), - A_Entropy = double(N), - AR_Entropy = double(N), - ARW_Entropy = double(N), - Clinic = character(N), - NumUniqueProcedures = numeric(N), - NumUniqueDiagnosisGroups = numeric(N), - NumPhysicians = numeric(N), - Weekday = character(N), - Month = character(N) - ) - - # Now add columns for the IVs. There will be three for each IV - - # Add the IV columns - for (cf in CFs){ - - ACHR[, paste0(cf,"_count"):= double(N)] - ACHR[, paste0(cf,"_compression"):= double(N)] - ACHR[, paste0(cf,"_entropy"):= double(N)] - - } - - # loop through the buckets. 
Result will be data frame with one row per bucket - for (i in 1:N){ - - b = i # as.integer(bucket_list[i]) - - # print once every 10 buckets - if (b%%10==0) {print(b)} - - # select the threads that go in this bucket - df = occ[occ[[BU]] ==bucket_list[i],] - - # bucket number - ACHR[b,bucket := b] - - # length of the thread (number of rows) - ACHR[b,NEvents := nrow(df)] - - # only do the computations if there are more than two occurrences - if (nrow(df) > 2) { - - # compute the duration of the visit in hours - # ACHR[b,VisitDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours') ] - - # compressibility of DV - ACHR[b,A_CompressRatio := compression_index(df,DV1)] - ACHR[b,AR_CompressRatio := compression_index(df,DV2)] - ACHR[b,ARW_CompressRatio := compression_index(df,DV3)] - - # NetComplexity of DV - # First get the network - n = threads_to_network_original(df,TN, DV1) - ACHR[b,A_NetComplexity := estimate_network_complexity( n )] - ACHR[b,A_Nodes := nrow(n$nodeDF) ] - ACHR[b,A_Edges := nrow(n$edgeDF) ] - - n = threads_to_network_original(df,TN, DV2) - ACHR[b,AR_NetComplexity := estimate_network_complexity( n )] - ACHR[b,AR_Nodes := nrow(n$nodeDF) ] - ACHR[b,AR_Edges := nrow(n$edgeDF) ] - - n = threads_to_network_original(df,TN, DV3) - ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )] - ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] - ACHR[b,ARW_Edges := nrow(n$edgeDF) ] - - # get the entropy for AR and ARW - ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] - ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] - - - # compute stuff on each context factor - for (cf in CFs){ - - # Count the unique elements in each cf - ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] - - # get the compression - ACHR[b, paste0(cf,"_compression") := compression_index(df,cf) ] - - # get the entropy - ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] - - } - } # kf nrows > 2 - - # Now copy in the rest of data - # this works because one visit is one bucket - # count the number of different diagnoses --> typical indicators of complexity - - - ACHR[b,'Clinic_date' := df[1,'Clinic_ymd']] - ACHR[b,'YMD_date' := df[1,'ymd']] - ACHR[b,'NumVisits' := length(unique(df[['Visit_ID']]))] - ACHR[b,'Clinic' := df[1,'Clinic']] - ACHR[b,'NumUniqueProcedures' := length(unique(df[['Proc']]))] - ACHR[b,'NumUniqueDiagnosisGroups' := length(unique(df[['Diagnosis_Group']]))] - ACHR[b,'NumPhysicians' := length(unique(df[['Physician']]))] - ACHR[b,'Weekday' := df[1,'Weekday']] - ACHR[b,'Month' := df[1,'Month']] - - - - } # loop thru buckets - - # copy this one for consistency - ACHR[,'A_Entropy' := ACHR[,'Action_entropy'] ] - - # return the table - return(ACHR) -} - -################################################################### -################################################################### - - -# ACHR stands for Antecedents of Complexity in Healthcare Routines. -# This is function is set up to compute process parameters on thousands of patient visits. -ACHR_batch_visits <- function(occ,TN, CFs) { - - DV1= newColName(CFs[1]) - DV2= newColName(CFs[1:2]) - DV3= newColName(CFs[1:3]) - print(DV1) - print(DV2) - print(DV3) - -# pick subsets -- one visit at a time in this version, but could be more -bucket_list <- make_buckets_1(occ, 'Visit_ID') - -# get the size (number of buckets) -N = length(bucket_list) -print(bucket_list) - - -# pre-allocate the data.table. 
Tables are supposed to be faster. -ACHR = data.table(bucket=integer(N), - NEvents = integer(N), - VisitStart= character(N), - VisitStartInt = integer(N), - VisitDuration=double(N), - A_NetComplexity=double(N), - AR_NetComplexity=double(N), - ARW_NetComplexity=double(N), - A_Nodes=double(N), - AR_Nodes=double(N), - ARW_Nodes=double(N), - A_Edges=double(N), - AR_Edges=double(N), - ARW_Edges=double(N), - A_CompressRatio = double(N), - AR_CompressRatio = double(N), - ARW_CompressRatio = double(N), - A_Entropy = double(N), - AR_Entropy = double(N), - ARW_Entropy = double(N), - Visit_ID = character(N), - Subject_ID = character(N), - Clinic = character(N), - Procedure = character(N), - Diagnosis = character(N), # diag in the raw data - Diagnosis_group = character(N), - Physician = character(N), - Weekday = character(N), - Month = character(N) - ) - -# Now add columns for the IVs. There will be three for each IV - -# Add the IV columns -for (cf in CFs){ - - ACHR[, paste0(cf,"_count"):= double(N)] - ACHR[, paste0(cf,"_compression"):= double(N)] - ACHR[, paste0(cf,"_entropy"):= double(N)] -} - -# loop through the buckets. Result will be data frame with one row per bucket -for (i in 1:N){ - - b = i # as.integer(bucket_list[i]) - - # print once every 100 visits - if (b%%100==0) {print(b)} - # select the threads that go in this bucket - df = occ[occ[[TN]] ==bucket_list[i],] - - # bucket number - ACHR[b,bucket := b] - - # length of the thread (number of rows) - ACHR[b,NEvents := nrow(df)] - - # only do the computations if there are more than two occurrences - if (nrow(df) > 2) { - - # compute the duration of the visit in hours - ACHR[b,VisitDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours') ] - - # compressibility of DV - ACHR[b,A_CompressRatio := compression_index(df,DV1)] - ACHR[b,AR_CompressRatio := compression_index(df,DV2)] - ACHR[b,ARW_CompressRatio := compression_index(df,DV3)] - - # NetComplexity of DV - # First get the network - # NetComplexity of DV - # First get the network - n = threads_to_network_original(df,TN, DV1) - ACHR[b,A_NetComplexity := estimate_network_complexity( n )] - ACHR[b,A_Nodes := nrow(n$nodeDF) ] - ACHR[b,A_Edges := nrow(n$edgeDF) ] - - n = threads_to_network_original(df,TN, DV2) - ACHR[b,AR_NetComplexity := estimate_network_complexity( n )] - ACHR[b,AR_Nodes := nrow(n$nodeDF) ] - ACHR[b,AR_Edges := nrow(n$edgeDF) ] - - n = threads_to_network_original(df,TN, DV3) - ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )] - ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] - ACHR[b,ARW_Edges := nrow(n$edgeDF) ] - - # get the entropy for AR and ARW - ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] - ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] - - - # compute stuff on each context factor - for (cf in CFs){ - - # Count the unique elements in each cf - ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] - - # get the compression - ACHR[b, paste0(cf,"_compression") := compression_index(df,cf) ] - - # get the entropy - ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] - - } -} # kf nrows > 2 - - # Now copy in the rest of data - # this only works because one visit is one bucket - - # ACHR[b,'VisitStart' := as.POSIXct( df[1,'tStamp']) ] - ACHR[b,'VisitStartInt' := df[1,'tStamp'] ] - - ACHR[b,'VisitStart' := as.character( df[1,'tStamp'] ) ] - - ACHR[b,'Visit_ID' := df[1,'Visit_ID']] - ACHR[b,'Subject_ID' := df[1,'Subject_ID']] - 
ACHR[b,'Clinic' := df[1,'Clinic']] - ACHR[b,'Procedure' := df[1,'Proc']] - ACHR[b,'Diagnosis' := df[1,'Diag']] - ACHR[b,'Diagnosis_group' := df[1,'Diagnosis_Group']] - ACHR[b,'Physician' := df[1,'Physician']] - ACHR[b,'Weekday' := df[1,'Weekday']] - ACHR[b,'Month' := df[1,'Month']] - - - -} # loop thru buckets - -# copy this one for consistency -ACHR[,'A_Entropy' := ACHR[,'Action_entropy'] ] - -# return the table -return(ACHR) -} - - -# Each bucket is a list of thread numbers that can be used to subset the list of occurrences -make_buckets_1 <- function(o, criteria){ - - return( unique(o[[criteria]]) ) - -} - - - - - -# -# make_box_plots <- function(){ -# ggboxplot(ACHR_test[NEvents>100 & Clinic=='DRH'], x = "VisitMonth", y = "NetComplexity", -# color = "VisitDay", -# ylab = "Complexity", xlab = "Month (DRH)") -# } -# - - - - - -get_timeScale <- function(){'hr'} - diff --git a/Data Prep/ACHR_Batch_V2.R b/Data Prep/ACHR_Batch_V2.R index 730f0b5..818ef08 100644 --- a/Data Prep/ACHR_Batch_V2.R +++ b/Data Prep/ACHR_Batch_V2.R @@ -35,7 +35,7 @@ # this function reads the raw data from URMC and creates files for clinic_day and visits -read_ACHR_data <- function(){ +read_ACHR_data_old <- function(){ library(tidyr) library(data.table) @@ -48,7 +48,9 @@ read_ACHR_data <- function(){ # This code is tailored for reading in new data from URMC, October 2018 # read the file into data frame - d <<- fread('auditfinal_10022018.csv') + d <<- fread('auditfinal_10022018.csv') + + # d <<- fread('10test.csv') # Sort and convert to data.table vt <<-data.table(arrange(d,desc(Visit_ID,asc(Timestamps)))) @@ -58,7 +60,7 @@ read_ACHR_data <- function(){ cnx = cn[c(13,1:12,14:15)] vt=setcolorder(vt,cnx) setnames(vt,'Timestamps','tStamp') - setnames(vt,'X','seqn') + # setnames(vt,'X','seqn') # make a dataframe copy just for fun vdf <<- as.data.frame(vt) From d1f3c1e13e67d2db370c2fe120dbea5ccf80e490 Mon Sep 17 00:00:00 2001 From: Brian Pentland Date: Tue, 20 Nov 2018 12:45:14 -0500 Subject: [PATCH 07/31] Converted to sapply for main loop --- .gitignore | 4 ++++ Data Prep/ACHR_Batch_PreProcess.R | 14 +++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 62fdb39..265a87a 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,7 @@ rsconnect/ >>>>>>> be0331537859f5addf650269855aa0a9d49ca5d5 .Rproj.user emrt+Visit_ID+Role_Action.Rdata +Thrds+threadNum+Action.Rdata +Threads+threadNum+Action.Rdata +Threads+threadNum+Action_Role.Rdata +emrt+Visit_ID+Role_Action_Workstation.Rdata diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R index ef43e23..ffa0a38 100644 --- a/Data Prep/ACHR_Batch_PreProcess.R +++ b/Data Prep/ACHR_Batch_PreProcess.R @@ -189,13 +189,13 @@ thread_occurrences <- function(occ, THREAD_CF, fname='emr'){ ############ extra stuff not used ############# - # # split occ data frame by threadNum to find earliest time value for that thread - # # then substract that from initiated relativeTime from above - # occ_split = lapply(split(occ, occ$threadNum), - # function(x) {x$relativeTime = difftime(x$relativeTime, min(lubridate::ymd_hms(x$tStamp)), units=timescale ); x}) - # - # # # row bind data frame back together - # occ= data.frame(do.call(rbind, occ_split)) + # split occ data frame by threadNum to find earliest time value for that thread + # then substract that from initiated relativeTime from above + occ_split = lapply(split(occ, occ$threadNum), + function(x) {x$relativeTime = difftime(x$relativeTime, min(lubridate::ymd_hms(x$tStamp)), 
units=timescale ); x})
+
+  # # row bind data frame back together
+  occ= data.frame(do.call(rbind, occ_split))

From 84d677a8cb83de686fe87a4f307dfd8190d9af52 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Thu, 22 Nov 2018 19:34:41 -0500
Subject: [PATCH 08/31] Added comments on work needed to get correct visit-role data
---
 .gitignore                        |  2 ++
 Data Prep/ACHR_Batch_PreProcess.R | 18 +++++++++++-------
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/.gitignore b/.gitignore
index 265a87a..7cc7327 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,3 +31,5 @@ Thrds+threadNum+Action.Rdata
 Threads+threadNum+Action.Rdata
 Threads+threadNum+Action_Role.Rdata
 emrt+Visit_ID+Role_Action_Workstation.Rdata
+emr+Visit_ID.Rdata
+Thrds+threadNum+Action_Role_Workstation.Rdata
diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R
index ffa0a38..3168d8b 100644
--- a/Data Prep/ACHR_Batch_PreProcess.R
+++ b/Data Prep/ACHR_Batch_PreProcess.R
@@ -143,10 +143,14 @@ thread_occurrences <- function(occ, THREAD_CF, fname='emr'){
   occ$threadNum = integer(nrow(occ))
   occ$seqNum = integer(nrow(occ))
 
-
+  #_#_#_#_#_#_ I think what we need to do to get the correct visit_role is sort by visit and time stamp
+  #_#_#_#_#_#_ But then SPLIT the visit according to visit_role or visit_workstation or whatever
   # Now sort the data set by the new threadNum and tStamp
   occ = occ[order(occ[[new_thread_col]],occ$tStamp),]
 
+
+  #_#_#_#_#_#_ This will be incorrect if the same role occurs more than once in a single visit.
+  #_#_#_#_#_#_ Need to break it out according to the changes in role within a visit...
   # get the list of unique identifies for the threads. The length of this list is the number of threads
   pov_list = unique(occ[[new_thread_col]])
   print(paste('Number of threads in this POV: ', length(pov_list)))

From 21a5ffd06f4ce71d7558c9d137f624202de89bca Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Thu, 22 Nov 2018 19:37:30 -0500
Subject: [PATCH 09/31] Added new functions for aggregating data
---
 Data Prep/.gitignore                  |   1 -
 Data Prep/ACHR_Batch_Aggregate_Data.R | 495 ++++++++++++++++++++++++++
 2 files changed, 495 insertions(+), 1 deletion(-)
 create mode 100644 Data Prep/ACHR_Batch_Aggregate_Data.R

diff --git a/Data Prep/.gitignore b/Data Prep/.gitignore
index 8ccf702..e69de29 100644
--- a/Data Prep/.gitignore
+++ b/Data Prep/.gitignore
@@ -1 +0,0 @@
-ACHR_Batch_Aggregate_Data.R
diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R
new file mode 100644
index 0000000..a8ff292
--- /dev/null
+++ b/Data Prep/ACHR_Batch_Aggregate_Data.R
@@ -0,0 +1,495 @@
+########################################################################################################## +# THREADNET: Batch processing for larger data sets +# +# ACHR_Batch_Aggregate_Data +# +# (c) 2018 Michigan State University. This software may be used according to the terms provided in the +# GNU General Public License (GPL-3.0) https://opensource.org/licenses/GPL-3.0? +# Brian Pentland +# Absolutely no warranty! +########################################################################################################## + + +# Two ways to aggregate the occurrences +# 1) By thread (usually visit) +# 2) By Clinic_day + + + + +################################################################### +################################################################### +# This is function is set up to aggregate the occurrences within threads -- typically visits +# occ = pre-processed threaded occurrences +# TN = threadNum in most cases +# CFs can be chosen -- they don't have to match the POV + +ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { + + library(tidyr) + library(dplyr) + library(ThreadNet) + library(ngram) + library(lubridate) + library(stringr) + + # don't remember why this is here... + new_event_col = newColName(EVENT_CFs) + if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } + + # get the list of buckets + bucket_list <- make_buckets_1(occ, TN) + + # make data frame with results + Thrds = data.frame( t(sapply( bucket_list, + function(b){ + + # select a subset + df = occ[ occ[[TN]] ==b , ] + + # Only run for visits with more than two occurrences + + # get the network + n = threads_to_network_original(df,TN, new_event_col) + + # compute each parameter and put them in a vector + c( + bucket=b, + NEvents = nrow(df), + ThreadStart= as.character( df[1,'tStamp'] ), + ThreadStartInt = df[1,'tStamp'], + ThreadDuration= difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours' ), + NetComplexity=estimate_network_complexity( n ), + Nodes=nrow(n$nodeDF), + Edges=nrow(n$edgeDF), + CompressRatio = compression_index(df,new_event_col), + Entropy = compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]), + NumProcedures = count_procedures(df$Proc[1]), + NumDiagnoses = count_diagnoses(df$Diag[1]), + wait_time = compute_wait_time(df), + # Visit_number = integer(N), + # LOC_CPT = character(N), + Visit_ID = df[1,'Visit_ID'], + Subject_ID = df[1,'Subject_ID'], + Clinic = df[1,'Clinic'], + ymd = df[1,'ymd'], + Clinic_ymd = df[1,'Clinic_ymd'], + Proc = df[1,'Proc'], + Diagnosis = df[1,'Diag'], # diag in the raw data + Diagnosis_group = df[1,'Diagnosis_Group'], + Physician = df[1,'Physician'], + Weekday = df[1,'Weekday'], + Month = df[1,'Month'], + Phase = compute_phase(df$tStamp[1]), + CF_Alignment = compute_alignment(df,TN, EVENT_CFs, ALL_CFs ), + + # need to figure out how to name these + sapply(ALL_CFs, function(cf){ + c( + assign( paste0(cf,"_count"), length(unique(df[[cf]]))), + assign( paste0(cf,"_entropy"), compute_entropy(table(df[[cf]])[table(df[[cf]])>0])) + ) }) + + ) + + } ))) + + save(Thrds, file=paste0(paste('Thrds',TN,new_event_col,sep='+'), '.Rdata')) + + return(Thrds) + +} + +# Each bucket is a list of thread numbers that can be used to subset the list of occurrences +make_buckets_1 <- function(o, criteria){ + + return( unique(o[[criteria]]) ) + +} + +# Need to write all these functions +compute_phase <- function(t){ + + t=as.Date(t) + + if ( t >= '2016-01-01' & t < 
'2016-06-07') return('one') + if ( t >= '2016-06-07' & t < '2016-09-01') return('two') + if ( t >= '2016-09-01' & t < '2017-04-15') return('three') + if ( t >= '2017-04-15' & t < '2017-09-01') return('four') + if ( t >= '2017-09-01' & t < '2017-12-31') return('five') + + # else return NA + return(NA) + +} + +# alignment is a measure of how much the number of nodes is increased by adding context factors +# always base this on the "Action" -- how many more "actions" are there? +# could also do this based on entropy, or write the function to use the values that were already computed +compute_alignment <- function(df,TN, EVENT_CFs, ALL_CFs ){ + + # first get numerator + n= length(unique(df$Action)) + + # Now get the total number with all of the CFs used to define the events + m= length(unique( df[[newColName(EVENT_CFs)]])) + + return(n/m) +} + + +compute_wait_time <- function(df){ + + # find the first occurrence of the chief complain. Seems to correspond closely to wait time + # Make sure it's not in the first 5 occurrences, and make sure to return a value of at least 1 if it never occurs + w= grep('MR_VN_CHIEF_COMPLAINT',df$Action)[1] + + return( difftime( lubridate::ymd_hms(df$tStamp[w]),lubridate::ymd_hms(df$tStamp[1]), units='hours' ) ) + +} + +count_procedures <- function( p ){ + + # get the overall number of items + total_num=length(grep('#@#', p )) + + # adjust for the visit codes that are not actual procedures + v=c('99211','99212','99213','99214','99215','99201','99202','99203','99204','99205') + + # just look at the first occurrence in the df + num_visit_codes = sum(sapply(v,function(x){length(grep( x, p ))})) + + # limit to non-negative + total_num = max(0, total_num - num_visit_codes) + + return( total_num ) +} + +# This one is easy +count_diagnoses <- function(d){ + return(length(grep('#@#', d ))) +} + + +## OLD ### +# ACHR_batch_threads_old <- function(occ,TN, EVENT_CFs, ALL_CFs) { +# +# library(tidyr) +# library(dplyr) +# library(ThreadNet) +# library(ngram) +# library(lubridate) +# library(stringr) +# +# new_event_col = newColName(EVENT_CFs) +# if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } +# +# # pick subsets --here we aggregate one thread at a time +# bucket_list <- make_buckets_1(occ, TN) +# +# # get the size (number of buckets) +# N = length(bucket_list) +# c +# # pre-allocate the result +# ACHR = data.table(bucket=integer(N), +# NEvents = integer(N), +# ThreadStart= character(N), +# ThreadStartInt = integer(N), +# ThreadDuration =double(N), +# NetComplexity=double(N), +# Nodes=double(N), +# Edges=double(N), +# CompressRatio = double(N), +# Entropy = double(N), +# NumProcedures = double(N), +# NumDiagnoses = double(N), +# wait_time = double(N), +# Visit_number = integer(N), +# LOC_CPT = character(N), +# Visit_ID = character(N), +# Subject_ID = character(N), +# Clinic = character(N), +# ymd = character(N), +# Clinic_ymd = character(N), +# Procedure = character(N), +# Diagnosis = character(N), # diag in the raw data +# Diagnosis_group = character(N), +# Physician = character(N), +# Weekday = character(N), +# Month = character(N), +# Phase = character(N), +# CF_Alignment = double(N) +# ) +# +# # Now add columns for the CFs. There will be two for each CF +# for (cf in ALL_CFs){ +# ACHR[, paste0(cf,"_count"):= double(N)] +# ACHR[, paste0(cf,"_entropy"):= double(N)] +# } +# +# # loop through the buckets. 
Result will be data frame with one row per bucket +# for (i in 1:N){ +# +# b = i # as.integer(bucket_list[i]) +# +# # print once every 100 visits +# if (b%%100==0) {print(b)} +# +# # select the threads that go in this bucket +# df = occ[occ[[TN]] ==bucket_list[i],] +# +# # bucket number +# ACHR[b,bucket := b] +# +# # length of the thread (number of rows) +# ACHR[b,NEvents := nrow(df)] +# +# # only do the computations if there are more than two occurrences +# if (nrow(df) > 2) { +# +# # compute the duration of the visit in hours +# ACHR[b,ThreadDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours') ] +# +# # compressibility of DV +# ACHR[b,CompressRatio := compression_index(df,new_event_col)] +# # ACHR[b,AR_CompressRatio := compression_index(df,DV2)] +# # ACHR[b,ARW_CompressRatio := compression_index(df,DV3)] +# +# # NetComplexity of DV +# # First get the network +# # NetComplexity of DV +# # First get the network +# n = threads_to_network_original(df,TN, new_event_col) +# ACHR[b,NetComplexity := estimate_network_complexity( n )] +# ACHR[b,Nodes := nrow(n$nodeDF) ] +# ACHR[b,Edges := nrow(n$edgeDF) ] +# +# +# # compute stuff on each context factor +# for (cf in ALL_CFs){ +# +# # Count the unique elements in each cf +# ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] +# +# # get the entropy +# ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] +# +# } +# } # kf nrows > 2 +# +# # Now copy in the rest of data +# # this only works because one visit is one bucket +# +# # ACHR[b,'VisitStart' := as.POSIXct( df[1,'tStamp']) ] +# ACHR[b,'ThreadStartInt' := df[1,'tStamp'] ] +# +# ACHR[b,'ThreadStart' := as.character( df[1,'tStamp'] ) ] +# +# ACHR[b,'Visit_ID' := df[1,'Visit_ID']] +# ACHR[b,'Subject_ID' := df[1,'Subject_ID']] +# ACHR[b,'Clinic' := df[1,'Clinic']] +# ACHR[b,'Procedure' := df[1,'Proc']] +# ACHR[b,'Diagnosis' := df[1,'Diag']] +# ACHR[b,'Diagnosis_group' := df[1,'Diagnosis_Group']] +# ACHR[b,'Physician' := df[1,'Physician']] +# ACHR[b,'Weekday' := df[1,'Weekday']] +# ACHR[b,'Month' := df[1,'Month']] +# +# +# +# } # loop thru buckets +# +# +# save(ACHR, file=paste0(paste('Threads',TN,new_event_col,sep='+'), '.Rdata')) +# +# +# # return the table +# return(ACHR) +# } + + + + + +################################################################### +################################################################### +ACHR_batch_clinic_days <- function(occ,TN, EVENT_CFs, ALL_CFs) { + + # Name for column that has events -- three variations + DV1= newColName(CFs[1]) + DV2= newColName(CFs[1:2]) + DV3= newColName(CFs[1:3]) + + print(DV1) + print(DV2) + print(DV3) + + # name for column that defines the buckets + BU = 'Clinic_ymd' + + # first get the date only + # occ$ymd <- format(as.POSIXct(occ$tStamp),"%Y-%m-%d") + + # make new columns as needed for clinic + day and events + # occ = unite(occ, 'Clinic_ymd', c('Clinic','ymd'),sep='_',remove = 'false') + + + + # pick subsets -- one visit at a time in this version, but could be more + bucket_list <- unique(occ[['Clinic_ymd']]) + + # get the size (number of buckets) + N = length(bucket_list) + print(N) + + # pre-allocate the data.table. Tables are supposed to be faster. 
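+  # (A toy version of this pre-allocation idiom, assuming nothing beyond base
+  #  data.table: data.table(x = integer(3), y = double(3)) builds a typed
+  #  3-row shell up front, which the loop below then fills in place with :=
+  #  instead of growing the table one row at a time.)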
+ ACHR = data.table(bucket=integer(N), + Clinic_date = character(N), + YMD_date = character(N), + NEvents = integer(N), + NumVisits = numeric(N), + A_NetComplexity=double(N), + AR_NetComplexity=double(N), + ARW_NetComplexity=double(N), + A_Nodes=double(N), + AR_Nodes=double(N), + ARW_Nodes=double(N), + A_Edges=double(N), + AR_Edges=double(N), + ARW_Edges=double(N), + A_CompressRatio = double(N), + AR_CompressRatio = double(N), + ARW_CompressRatio = double(N), + A_Entropy = double(N), + AR_Entropy = double(N), + ARW_Entropy = double(N), + Clinic = character(N), + NumUniqueProcedures = numeric(N), + NumUniqueDiagnosisGroups = numeric(N), + NumPhysicians = numeric(N), + Weekday = character(N), + Month = character(N) + ) + + # Now add columns for the IVs. There will be three for each IV + + # Add the IV columns + for (cf in CFs){ + + ACHR[, paste0(cf,"_count"):= double(N)] + ACHR[, paste0(cf,"_compression"):= double(N)] + ACHR[, paste0(cf,"_entropy"):= double(N)] + + } + + # loop through the buckets. Result will be data frame with one row per bucket + for (i in 1:N){ + + b = i # as.integer(bucket_list[i]) + + # print once every 10 buckets + if (b%%10==0) {print(b)} + + # select the threads that go in this bucket + df = occ[occ[[BU]] ==bucket_list[i],] + + # bucket number + ACHR[b,bucket := b] + + # length of the thread (number of rows) + ACHR[b,NEvents := nrow(df)] + + # only do the computations if there are more than two occurrences + if (nrow(df) > 2) { + + # compute the duration of the visit in hours + # ACHR[b,VisitDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours') ] + + # compressibility of DV + ACHR[b,A_CompressRatio := compression_index(df,DV1)] + ACHR[b,AR_CompressRatio := compression_index(df,DV2)] + ACHR[b,ARW_CompressRatio := compression_index(df,DV3)] + + # NetComplexity of DV + # First get the network + n = threads_to_network_original(df,TN, DV1) + ACHR[b,A_NetComplexity := estimate_network_complexity( n )] + ACHR[b,A_Nodes := nrow(n$nodeDF) ] + ACHR[b,A_Edges := nrow(n$edgeDF) ] + + n = threads_to_network_original(df,TN, DV2) + ACHR[b,AR_NetComplexity := estimate_network_complexity( n )] + ACHR[b,AR_Nodes := nrow(n$nodeDF) ] + ACHR[b,AR_Edges := nrow(n$edgeDF) ] + + n = threads_to_network_original(df,TN, DV3) + ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )] + ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] + ACHR[b,ARW_Edges := nrow(n$edgeDF) ] + + # get the entropy for AR and ARW + ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] + ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] + + + # compute stuff on each context factor + for (cf in CFs){ + + # Count the unique elements in each cf + ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] + + # get the compression + ACHR[b, paste0(cf,"_compression") := compression_index(df,cf) ] + + # get the entropy + ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] + + } + } # kf nrows > 2 + + # Now copy in the rest of data + # this works because one visit is one bucket + # count the number of different diagnoses --> typical indicators of complexity + + + ACHR[b,'Clinic_date' := df[1,'Clinic_ymd']] + ACHR[b,'YMD_date' := df[1,'ymd']] + ACHR[b,'NumVisits' := length(unique(df[['Visit_ID']]))] + ACHR[b,'Clinic' := df[1,'Clinic']] + ACHR[b,'NumUniqueProcedures' := length(unique(df[['Proc']]))] + ACHR[b,'NumUniqueDiagnosisGroups' := length(unique(df[['Diagnosis_Group']]))] + 
ACHR[b,'NumPhysicians' := length(unique(df[['Physician']]))] + ACHR[b,'Weekday' := df[1,'Weekday']] + ACHR[b,'Month' := df[1,'Month']] + + + + } # loop thru buckets + + # copy this one for consistency + ACHR[,'A_Entropy' := ACHR[,'Action_entropy'] ] + + # return the table + return(ACHR) +} + +################################################################### +################################################################### + + + + +# +# make_box_plots <- function(){ +# ggboxplot(ACHR_test[NEvents>100 & Clinic=='DRH'], x = "VisitMonth", y = "NetComplexity", +# color = "VisitDay", +# ylab = "Complexity", xlab = "Month (DRH)") +# } +# + + + + + +get_timeScale <- function(){'hr'} + From 1db94381010a8a2a928a6c87258a44f727569b5b Mon Sep 17 00:00:00 2001 From: Brian Pentland Date: Fri, 23 Nov 2018 09:43:50 -0500 Subject: [PATCH 10/31] cleaning up the aggregation functions --- Data Prep/ACHR_Batch_Aggregate_Data.R | 86 +++++++++++++++++++-------- 1 file changed, 60 insertions(+), 26 deletions(-) diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R index a8ff292..cb13f50 100644 --- a/Data Prep/ACHR_Batch_Aggregate_Data.R +++ b/Data Prep/ACHR_Batch_Aggregate_Data.R @@ -24,6 +24,8 @@ # TN = threadNum in most cases # CFs can be chosen -- they don't have to match the POV +# need to bring Role_ID forward when looking at Visit_Role, so we can track residents over time. + ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { library(tidyr) @@ -33,9 +35,12 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { library(lubridate) library(stringr) - # don't remember why this is here... + # Add columns for combinations of CFs if needed new_event_col = newColName(EVENT_CFs) if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } + + all_cf_col = newColName(ALL_CFs) + if (!(all_cf_col %in% colnames(occ))) { occ = combineContextFactors(occ,ALL_CFs,all_cf_col) } # get the list of buckets bucket_list <- make_buckets_1(occ, TN) @@ -49,16 +54,31 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { # Only run for visits with more than two occurrences - # get the network - n = threads_to_network_original(df,TN, new_event_col) + # get the network -- only if there are enough rows... 
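+  # (The else branch below stands in a degenerate one-cell network: t(c(0)) is
+  #  a 1x1 matrix, so nrow(n$nodeDF) and nrow(n$edgeDF) still return 1 for
+  #  visits that are too short to form a real network.)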
+ + if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) + else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) # compute each parameter and put them in a vector c( bucket=b, + Clinic = df[1,'Clinic'], + Physician = as.character(df[1,'Physician']), + ymd = df[1,'ymd'], + Clinic_ymd = df[1,'Clinic_ymd'], + Weekday = df[1,'Weekday'], + Month = df[1,'Month'], + Phase = compute_phase(df$tStamp[1]), + threadNum = df[1,'threadNum'], + Visit_ID = df[1,'Visit_ID'], + Subject_ID = df[1,'Subject_ID'], NEvents = nrow(df), ThreadStart= as.character( df[1,'tStamp'] ), ThreadStartInt = df[1,'tStamp'], ThreadDuration= difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours' ), + wait_time = compute_wait_time(df), + # Visit_number = integer(N), + # LOC_CPT = character(N), NetComplexity=estimate_network_complexity( n ), Nodes=nrow(n$nodeDF), Edges=nrow(n$edgeDF), @@ -66,34 +86,43 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { Entropy = compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]), NumProcedures = count_procedures(df$Proc[1]), NumDiagnoses = count_diagnoses(df$Diag[1]), - wait_time = compute_wait_time(df), - # Visit_number = integer(N), - # LOC_CPT = character(N), - Visit_ID = df[1,'Visit_ID'], - Subject_ID = df[1,'Subject_ID'], - Clinic = df[1,'Clinic'], - ymd = df[1,'ymd'], - Clinic_ymd = df[1,'Clinic_ymd'], - Proc = df[1,'Proc'], - Diagnosis = df[1,'Diag'], # diag in the raw data - Diagnosis_group = df[1,'Diagnosis_Group'], - Physician = df[1,'Physician'], - Weekday = df[1,'Weekday'], - Month = df[1,'Month'], - Phase = compute_phase(df$tStamp[1]), - CF_Alignment = compute_alignment(df,TN, EVENT_CFs, ALL_CFs ), + Proc = as.character(df[1,'Proc']), + Diagnosis = as.character(df[1,'Diag']), + Diagnosis_group = df[1,'Diagnosis_Group'], + CF_Alignment = 1, + # CF_Alignment = compute_alignment(df,TN, EVENT_CFs, ALL_CFs ), + ALL_CF_count = length(unique(df[[all_cf_col]])), + ALL_CF_entropy = compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]), - # need to figure out how to name these + # name these afterwareds sapply(ALL_CFs, function(cf){ c( - assign( paste0(cf,"_count"), length(unique(df[[cf]]))), - assign( paste0(cf,"_entropy"), compute_entropy(table(df[[cf]])[table(df[[cf]])>0])) + length(unique(df[[cf]])), + compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) + + # assign( paste0(cf,"_count"), length(unique(df[[cf]]))), + # assign( paste0(cf,"_entropy"), compute_entropy(table(df[[cf]])[table(df[[cf]])>0])) ) }) ) } ))) + + # name the last columns -- code has to match above + cn = as.vector(sapply(ALL_CFs, function(cf){ + c( paste0(cf,"_count"), + paste0(cf,"_entropy") ) })) + + # now assign them to the last columns + last_col = ncol(Thrds) + first_col= last_col-length(cn)+1 + setnames(Thrds, c(first_col:last_col), cn) + + # Computer the alignment of the context factors + Thrds$CF_Alignment = as.numeric( Thrds$Action_count) / as.numeric( Thrds$ALL_CF_count ) + + save(Thrds, file=paste0(paste('Thrds',TN,new_event_col,sep='+'), '.Rdata')) return(Thrds) @@ -131,7 +160,7 @@ compute_alignment <- function(df,TN, EVENT_CFs, ALL_CFs ){ # first get numerator n= length(unique(df$Action)) - # Now get the total number with all of the CFs used to define the events + # Now get the total number with all of the CFs m= length(unique( df[[newColName(EVENT_CFs)]])) return(n/m) @@ -150,14 +179,17 @@ compute_wait_time <- function(df){ count_procedures <- function( p ){ + # if no procedures, return zero + if (is.na( p )) 
return( 0 )
+
   # get the overall number of items
-  total_num=length(grep('#@#', p ))
+  total_num = str_count( p, '#@#')
 
   # adjust for the visit codes that are not actual procedures
   v=c('99211','99212','99213','99214','99215','99201','99202','99203','99204','99205')
 
   # just look at the first occurrence in the df
-  num_visit_codes = sum(sapply(v,function(x){length(grep( x, p ))}))
+  num_visit_codes = sum(str_count( p, v ))
 
   # limit to non-negative
   total_num = max(0, total_num - num_visit_codes)
@@ -167,7 +199,9 @@ count_procedures <- function( p ){
 
 # This one is easy
 count_diagnoses <- function(d){
-  return(length(grep('#@#', d )))
+
+  # Sometimes the marker is missing, so set the floor to one. There is always at least one.
+  return( max(1, str_count( d, '#@#' )))
 }
 

From 7dc9bbf2323552edb99efb389bf08976efac0351 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Fri, 23 Nov 2018 11:44:26 -0500
Subject: [PATCH 11/31] Fixed bug in alignment

---
 Data Prep/ACHR_Batch_Aggregate_Data.R | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R
index cb13f50..fb6a81a 100644
--- a/Data Prep/ACHR_Batch_Aggregate_Data.R
+++ b/Data Prep/ACHR_Batch_Aggregate_Data.R
@@ -34,6 +34,7 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) {
   library(ngram)
   library(lubridate)
   library(stringr)
+  library(data.table)
 
   # Add columns for combinations of CFs if needed
   new_event_col = newColName(EVENT_CFs)
@@ -119,8 +120,8 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) {
   first_col= last_col-length(cn)+1
   setnames(Thrds, c(first_col:last_col), cn)
 
-  # Computer the alignment of the context factors
-  Thrds$CF_Alignment = as.numeric( Thrds$Action_count) / as.numeric( Thrds$ALL_CF_count )
+  # Compute the alignment of the context factors
+  Thrds$CF_Alignment = as.numeric( as.character(Thrds$Action_count)) / as.numeric( as.character(Thrds$ALL_CF_count ))
 
   save(Thrds, file=paste0(paste('Thrds',TN,new_event_col,sep='+'), '.Rdata'))

From 5d544abe7f16c844b664309bb44364a0311327fd Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Sat, 24 Nov 2018 04:01:57 -0500
Subject: [PATCH 12/31] Fixed bug in Visit_role -- and it's faster

---
 Data Prep/ACHR_Batch_Aggregate_Data.R | 20 +++------
 Data Prep/ACHR_Batch_PreProcess.R     | 60 ++++++++-------------------
 2 files changed, 24 insertions(+), 56 deletions(-)

diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R
index fb6a81a..4b69919 100644
--- a/Data Prep/ACHR_Batch_Aggregate_Data.R
+++ b/Data Prep/ACHR_Batch_Aggregate_Data.R
@@ -90,27 +90,20 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) {
       Proc = as.character(df[1,'Proc']),
       Diagnosis = as.character(df[1,'Diag']),
       Diagnosis_group = df[1,'Diagnosis_Group'],
-      CF_Alignment = 1,
-      # CF_Alignment = compute_alignment(df,TN, EVENT_CFs, ALL_CFs ),
+      CF_Alignment = 1, # make placeholder, but compute below
       ALL_CF_count = length(unique(df[[all_cf_col]])),
       ALL_CF_entropy = compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]),
 
-      # name these afterwareds
+      # name these columns afterwards
       sapply(ALL_CFs, function(cf){
-        c(
-          length(unique(df[[cf]])),
-          compute_entropy(table(df[[cf]])[table(df[[cf]])>0])
-
-          # assign( paste0(cf,"_count"), length(unique(df[[cf]]))),
-          # assign( paste0(cf,"_entropy"), compute_entropy(table(df[[cf]])[table(df[[cf]])>0]))
-        ) })
+        c( length(unique(df[[cf]])),
+           compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ) })
 
       )
-
-    } )))
+    } )))
-
   # name the last
columns -- code has to match above + # name the last columns -- code has to match above cn = as.vector(sapply(ALL_CFs, function(cf){ c( paste0(cf,"_count"), paste0(cf,"_entropy") ) })) @@ -122,7 +115,6 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { # Compute the alignment of the context factors Thrds$CF_Alignment = as.numeric( as.character(Thrds$Action_count)) / as.numeric( as.character(Thrds$ALL_CF_count )) - save(Thrds, file=paste0(paste('Thrds',TN,new_event_col,sep='+'), '.Rdata')) diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R index 3168d8b..ded690e 100644 --- a/Data Prep/ACHR_Batch_PreProcess.R +++ b/Data Prep/ACHR_Batch_PreProcess.R @@ -54,7 +54,7 @@ read_ACHR_data <- function(fname){ # read the file into data frame d <- fread( paste0(fname, '.csv') ) - # Sort by visit and timestamp + # Sort by visit and timestamps d <- arrange(d,desc(Visit_ID,asc(Timestamps))) ################################################################################## @@ -122,7 +122,6 @@ cleanOccBatch <- function(fileRows){ # ALL_CF = c('Action','Workstation','Role','Clinic','Diagnosis_Group') # thread_CF = c('Visit_ID', 'Role', 'Workstation') # event_CF = c('Action','Role','Workstation') - # It returns the threaded set of occurrences and also saves it as Rdata. thread_occurrences <- function(occ, THREAD_CF, fname='emr'){ @@ -130,60 +129,36 @@ thread_occurrences <- function(occ, THREAD_CF, fname='emr'){ # these will be new column names new_thread_col = newColName(THREAD_CF) - # new_event_col = newColName(EVENT_CF) + # new_event_col = newColName(EVENT_CF) print(paste('new_thread_col: ',new_thread_col)) - # print(paste('new_event_col: ', new_event_col)) + # print(paste('new_event_col: ', new_event_col)) # Add names for the new columns, if necessary if (!(new_thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CF,new_thread_col) } -# if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CF,new_event_col) } + # if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CF,new_event_col) } # ThreadNet code assumes these columns will be there, so we need to add them occ$threadNum = integer(nrow(occ)) occ$seqNum = integer(nrow(occ)) - #_#_#_#_#_#_ I think what we need to do to get the correct visit_role is sort by visit and time stamp - #_#_#_#_#_#_ But then SPLIT the visit according to visit_role or visit_workstation or whatever - # Now sort the data set by the new threadNum and tStamp - occ = occ[order(occ[[new_thread_col]],occ$tStamp),] - + # Assume the data are sorted by visit_ID and tStamp -- the natural flow of the visit. + idx_list = which(occ[[new_thread_col]] !=dplyr::lag(occ[[new_thread_col]])) + idx_list = c(idx_list, nrow(occ)) + start_idx = c(1, head(idx_list,-1)) # add 1 to the front and drop the last + end_idx = idx_list - 1 # shift back by one - #_#_#_#_#_#_ This will be incorrect if the same role is occurs more than once in a single visit. - #_#_#_#_#_#_ Need to break it out according to the changes in role within a visit... - # get the list of unique identifies for the threads. 
The length of this list is the number of threads - pov_list = unique(occ[[new_thread_col]]) - print(paste('Number of threads in this POV: ', length(pov_list))) + print(paste('Number of threads in this POV: ', length(idx_list))) - start_row=1 - thrd=1 - for (p in pov_list){ - - # get the length of the thread - tlen = sum(occ[[new_thread_col]]==p) - - # guard against error - if (length(tlen)==0) tlen=0 - if (tlen>0){ - - #compute the index of the end row - end_row = start_row+tlen-1 - - # they all get the same thread number and incrementing seqNum - occ[start_row:end_row, "threadNum"] <- as.matrix(rep(as.integer(thrd),tlen)) - occ[start_row:end_row, "seqNum"] <- seq(tlen) - - # increment the counters for the next thread - start_row = end_row + 1 - thrd=thrd+1 - - if (thrd %% 1000 ==0) print(paste0('Thread count = ',thrd)) - + # sapply(seq(length(idx_list)), function(x) + for (x in seq(length(idx_list))) + { if (x %% 1000 ==0) print(paste0('Thread count = ',x)) + occ[start_idx[x]:end_idx[x],'threadNum'] = x + occ[start_idx[x]:end_idx[x],'seqNum'] = c(1:(end_idx[x]-start_idx[x]+1)) } - } - # Now save the result for later and return it, as well. -# save(occ, file=paste0(paste(fname,new_thread_col,new_event_col,sep='+'), '.Rdata')) + # Now save the result for later and return it, as well. + # save(occ, file=paste0(paste(fname,new_thread_col,new_event_col,sep='+'), '.Rdata')) save(occ, file=paste0(paste(fname,new_thread_col,sep='+'), '.Rdata')) print(paste('Saved threaded occurrences: ', nrow(occ))) @@ -192,6 +167,7 @@ thread_occurrences <- function(occ, THREAD_CF, fname='emr'){ } + ############ extra stuff not used ############# # split occ data frame by threadNum to find earliest time value for that thread # # then substract that from initiated relativeTime from above From 809bd472aaee612dd63c19f10d5bdac4923a3225 Mon Sep 17 00:00:00 2001 From: Brian Pentland Date: Sun, 25 Nov 2018 12:56:23 -0500 Subject: [PATCH 13/31] cleaning up the trajectory code... 
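
A minimal sketch of the distance step these trajectory functions are built
on, with toy 2-gram frequency vectors (philentropy::distance() works on the
rows of a matrix, and 'cosine' is the method used in the code):

    library(philentropy)
    day1 <- c(5, 3, 0, 2)   # 2-gram counts for the reference day
    day2 <- c(4, 4, 1, 0)   # counts for the same 2-grams on a later day
    distance(rbind(day1, day2), method = 'cosine')
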
--- Data Prep/ACHR_Batch_Aggregate_Data.R | 519 +++++++++++++++--------- Data Prep/ACHR_Batch_PreProcess.R | 2 + Data Prep/ACHR_Clinic_Trajectories_v3.R | 102 ++--- Tables and Figures/Clinic_day_plots.R | 7 +- 4 files changed, 386 insertions(+), 244 deletions(-) diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R index 4b69919..bad8ae5 100644 --- a/Data Prep/ACHR_Batch_Aggregate_Data.R +++ b/Data Prep/ACHR_Batch_Aggregate_Data.R @@ -10,12 +10,23 @@ ########################################################################################################## -# Two ways to aggregate the occurrences -# 1) By thread (usually visit) +# Two functions to aggregate the occurrences +# 1) By thread (usually visit, but could be visit_role) # 2) By Clinic_day +############################################################################################# +# Functions to make buckets +# Each bucket is a list of thread numbers that can be used to subset the list of occurrences +# Need be careful how you call thus because it can aggregate threads in unexpected ways +# For example, if you want Visit_ID_Role, use criteria = threadNum on the appropriately threaded input +make_buckets_1 <- function(o, criteria){ + + return( unique(o[[criteria]]) ) + +} + ################################################################### ################################################################### @@ -46,6 +57,10 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { # get the list of buckets bucket_list <- make_buckets_1(occ, TN) + # print the number of buckets + print(paste0('Number of buckets=', length(bucket_list) )) + + # make data frame with results Thrds = data.frame( t(sapply( bucket_list, function(b){ @@ -73,6 +88,7 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { threadNum = df[1,'threadNum'], Visit_ID = df[1,'Visit_ID'], Subject_ID = df[1,'Subject_ID'], + Role_ID = as.character( df[1,'Role_ID'] ), NEvents = nrow(df), ThreadStart= as.character( df[1,'tStamp'] ), ThreadStartInt = df[1,'tStamp'], @@ -96,8 +112,10 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { # name these columns afterwards sapply(ALL_CFs, function(cf){ - c( length(unique(df[[cf]])), - compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ) }) + c( length(unique(df[[cf]])) + # , + # compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) + ) }) ) } ))) @@ -105,8 +123,10 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { # name the last columns -- code has to match above cn = as.vector(sapply(ALL_CFs, function(cf){ - c( paste0(cf,"_count"), - paste0(cf,"_entropy") ) })) + c( paste0(cf,"_count") + # , + # paste0(cf,"_entropy") + ) })) # now assign them to the last columns last_col = ncol(Thrds) @@ -122,12 +142,11 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { } -# Each bucket is a list of thread numbers that can be used to subset the list of occurrences -make_buckets_1 <- function(o, criteria){ - - return( unique(o[[criteria]]) ) - -} + + +get_timeScale <- function(){'hr'} + + # Need to write all these functions compute_phase <- function(t){ @@ -148,18 +167,19 @@ compute_phase <- function(t){ # alignment is a measure of how much the number of nodes is increased by adding context factors # always base this on the "Action" -- how many more "actions" are there? 
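 # (worked example: 10 distinct Actions spread over 25 distinct
 #  Action_Role_Workstation combinations gives 10/25 = 0.4; a ratio near 1
 #  means the extra context factors add almost no new variety)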
# could also do this based on entropy, or write the function to use the values that were already computed -compute_alignment <- function(df,TN, EVENT_CFs, ALL_CFs ){ - - # first get numerator - n= length(unique(df$Action)) - - # Now get the total number with all of the CFs - m= length(unique( df[[newColName(EVENT_CFs)]])) - - return(n/m) -} +# compute_alignment <- function(df,TN, EVENT_CFs, ALL_CFs ){ +# +# # first get numerator +# n= length(unique(df$Action)) +# +# # Now get the total number with all of the CFs +# m= length(unique( df[[newColName(EVENT_CFs)]])) +# +# return(n/m) +# } +# NEEDS WORK!! Far too simplistic. compute_wait_time <- function(df){ # find the first occurrence of the chief complain. Seems to correspond closely to wait time @@ -170,6 +190,7 @@ compute_wait_time <- function(df){ } +# THIS ONE IS GOOD. count_procedures <- function( p ){ # if no procedures, return zero @@ -198,6 +219,129 @@ count_diagnoses <- function(d){ } + + + + +################################################################### +################################################################### +# This is function is set up to aggregate the occurrences among collections of visit -- typically clinic_days +# occ = pre-processed threaded occurrences +# TN = threadNum in most cases +# CFs can be chosen -- they don't have to match the POV + +# need to bring Role_ID forward when looking at Visit_Role, so we can track residents over time. + +ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) { + + library(tidyr) + library(dplyr) + library(ThreadNet) + library(ngram) + library(lubridate) + library(stringr) + library(data.table) + + # make a list of unique buckets. Typical example + bucket_list <- unique(occ[[TN]]) + + # print the number of buckets + print(paste0('Number of buckets=', length(bucket_list) )) + + # Add columns for combinations of CFs if needed + new_event_col = newColName(EVENT_CFs) + if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } + + all_cf_col = newColName(ALL_CFs) + if (!(all_cf_col %in% colnames(occ))) { occ = combineContextFactors(occ,ALL_CFs,all_cf_col) } + + + # make data frame with results + cds = data.frame( t(sapply( bucket_list, + function(b){ + + # select a subset of occurrences for the bucket + df = occ[ occ[[TN]] ==b , ] + + + # get the network -- only if there are enough rows... 
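+  # (For intuition on the entropy terms below, computed on a toy frequency
+  #  table: with counts c(5,3,2), p <- c(5,3,2)/10 and -sum(p * log2(p)) is
+  #  about 1.49 bits; compute_entropy() is assumed to implement this Shannon
+  #  form.)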
+ if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) + else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) + + # compute each parameter and put them in a vector + c( + bucket=b, + Clinic = df[1,'Clinic'], + Physician = as.character(df[1,'Physician']), + ymd = df[1,'ymd'], + Clinic_ymd = df[1,'Clinic_ymd'], + Weekday = df[1,'Weekday'], + Month = df[1,'Month'], + Phase = compute_phase(df$tStamp[1]), + threadNum = df[1,'threadNum'], + Visit_ID = df[1,'Visit_ID'], + Subject_ID = df[1,'Subject_ID'], + NEvents = nrow(df), + bucketStart= as.character( df[1,'tStamp'] ), + bucketStartInt = df[1,'tStamp'], + bucketDuration= difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours' ), + # wait_time = compute_wait_time(df), + # Visit_number = integer(N), + # LOC_CPT = character(N), + # NumUniqueProcedures = length(unique(df[['Proc']]) + NumVisits = length(unique(df[['Visit_ID']])), + NumUniqueDiagnosisGroups = length(unique(df[['Diagnosis_Group']])), + NumPhysicians = length(unique(df[['Physician']])), + TotalStaff = length(unique(df[['Role_ID']])), + NetComplexity=estimate_network_complexity( n ), + Nodes=nrow(n$nodeDF), + Edges=nrow(n$edgeDF), + CompressRatio = compression_index(df,new_event_col), + Entropy = compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]), + # NumProcedures = count_procedures(df$Proc[1]), + # NumDiagnoses = count_diagnoses(df$Diag[1]), + # Diagnosis_group = df[1,'Diagnosis_Group'], + CF_Alignment = 1, # make placeholder, but compute below + ALL_CF_count = length(unique(df[[all_cf_col]])), + ALL_CF_entropy = compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]), + + # name these columns afterwards + sapply(ALL_CFs, function(cf){ + c( length(unique(df[[cf]])) + # , + # compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) + ) }) + + ) + } ))) + + + # name the last columns -- code has to match above + cn = as.vector(sapply(ALL_CFs, function(cf){ + c( paste0(cf,"_count") + # , + # paste0(cf,"_entropy") + ) })) + + # now assign them to the last columns + last_col = ncol(cds) + first_col= last_col-length(cn)+1 + setnames(cds, c(first_col:last_col), cn) + + # Compute the alignment of the context factors + cds$CF_Alignment = as.numeric( as.character(cds$Action_count)) / as.numeric( as.character(cds$ALL_CF_count )) + + save(cds, file=paste0(paste('cds',TN,new_event_col,sep='+'), '.Rdata')) + + return(cds) + +} + + +################################################################### +################################################################### +## OLD ### +## OLD ### ## OLD ### # ACHR_batch_threads_old <- function(occ,TN, EVENT_CFs, ALL_CFs) { # @@ -335,176 +479,167 @@ count_diagnoses <- function(d){ # } - - - ################################################################### ################################################################### -ACHR_batch_clinic_days <- function(occ,TN, EVENT_CFs, ALL_CFs) { - - # Name for column that has events -- three variations - DV1= newColName(CFs[1]) - DV2= newColName(CFs[1:2]) - DV3= newColName(CFs[1:3]) - - print(DV1) - print(DV2) - print(DV3) - - # name for column that defines the buckets - BU = 'Clinic_ymd' - - # first get the date only - # occ$ymd <- format(as.POSIXct(occ$tStamp),"%Y-%m-%d") - - # make new columns as needed for clinic + day and events - # occ = unite(occ, 'Clinic_ymd', c('Clinic','ymd'),sep='_',remove = 'false') - - - - # pick subsets -- one visit at a time in this version, but could be more - bucket_list <- unique(occ[['Clinic_ymd']]) - 
- # get the size (number of buckets) - N = length(bucket_list) - print(N) - - # pre-allocate the data.table. Tables are supposed to be faster. - ACHR = data.table(bucket=integer(N), - Clinic_date = character(N), - YMD_date = character(N), - NEvents = integer(N), - NumVisits = numeric(N), - A_NetComplexity=double(N), - AR_NetComplexity=double(N), - ARW_NetComplexity=double(N), - A_Nodes=double(N), - AR_Nodes=double(N), - ARW_Nodes=double(N), - A_Edges=double(N), - AR_Edges=double(N), - ARW_Edges=double(N), - A_CompressRatio = double(N), - AR_CompressRatio = double(N), - ARW_CompressRatio = double(N), - A_Entropy = double(N), - AR_Entropy = double(N), - ARW_Entropy = double(N), - Clinic = character(N), - NumUniqueProcedures = numeric(N), - NumUniqueDiagnosisGroups = numeric(N), - NumPhysicians = numeric(N), - Weekday = character(N), - Month = character(N) - ) - - # Now add columns for the IVs. There will be three for each IV - - # Add the IV columns - for (cf in CFs){ - - ACHR[, paste0(cf,"_count"):= double(N)] - ACHR[, paste0(cf,"_compression"):= double(N)] - ACHR[, paste0(cf,"_entropy"):= double(N)] - - } - - # loop through the buckets. Result will be data frame with one row per bucket - for (i in 1:N){ - - b = i # as.integer(bucket_list[i]) - - # print once every 10 buckets - if (b%%10==0) {print(b)} - - # select the threads that go in this bucket - df = occ[occ[[BU]] ==bucket_list[i],] - - # bucket number - ACHR[b,bucket := b] - - # length of the thread (number of rows) - ACHR[b,NEvents := nrow(df)] - - # only do the computations if there are more than two occurrences - if (nrow(df) > 2) { - - # compute the duration of the visit in hours - # ACHR[b,VisitDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours') ] - - # compressibility of DV - ACHR[b,A_CompressRatio := compression_index(df,DV1)] - ACHR[b,AR_CompressRatio := compression_index(df,DV2)] - ACHR[b,ARW_CompressRatio := compression_index(df,DV3)] - - # NetComplexity of DV - # First get the network - n = threads_to_network_original(df,TN, DV1) - ACHR[b,A_NetComplexity := estimate_network_complexity( n )] - ACHR[b,A_Nodes := nrow(n$nodeDF) ] - ACHR[b,A_Edges := nrow(n$edgeDF) ] - - n = threads_to_network_original(df,TN, DV2) - ACHR[b,AR_NetComplexity := estimate_network_complexity( n )] - ACHR[b,AR_Nodes := nrow(n$nodeDF) ] - ACHR[b,AR_Edges := nrow(n$edgeDF) ] - - n = threads_to_network_original(df,TN, DV3) - ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )] - ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] - ACHR[b,ARW_Edges := nrow(n$edgeDF) ] - - # get the entropy for AR and ARW - ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] - ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] - - - # compute stuff on each context factor - for (cf in CFs){ - - # Count the unique elements in each cf - ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] - - # get the compression - ACHR[b, paste0(cf,"_compression") := compression_index(df,cf) ] - - # get the entropy - ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] - - } - } # kf nrows > 2 - - # Now copy in the rest of data - # this works because one visit is one bucket - # count the number of different diagnoses --> typical indicators of complexity - - - ACHR[b,'Clinic_date' := df[1,'Clinic_ymd']] - ACHR[b,'YMD_date' := df[1,'ymd']] - ACHR[b,'NumVisits' := length(unique(df[['Visit_ID']]))] - ACHR[b,'Clinic' := df[1,'Clinic']] - 
ACHR[b,'NumUniqueProcedures' := length(unique(df[['Proc']]))] - ACHR[b,'NumUniqueDiagnosisGroups' := length(unique(df[['Diagnosis_Group']]))] - ACHR[b,'NumPhysicians' := length(unique(df[['Physician']]))] - ACHR[b,'Weekday' := df[1,'Weekday']] - ACHR[b,'Month' := df[1,'Month']] - - - - } # loop thru buckets - - # copy this one for consistency - ACHR[,'A_Entropy' := ACHR[,'Action_entropy'] ] - - # return the table - return(ACHR) -} - -################################################################### -################################################################### - - - +# ACHR_batch_clinic_days <- function(occ,TN, EVENT_CFs, ALL_CFs) { +# +# # Name for column that has events -- three variations +# DV1= newColName(CFs[1]) +# DV2= newColName(CFs[1:2]) +# DV3= newColName(CFs[1:3]) +# +# print(DV1) +# print(DV2) +# print(DV3) +# +# # name for column that defines the buckets +# BU = 'Clinic_ymd' +# +# # first get the date only +# # occ$ymd <- format(as.POSIXct(occ$tStamp),"%Y-%m-%d") +# +# # make new columns as needed for clinic + day and events +# # occ = unite(occ, 'Clinic_ymd', c('Clinic','ymd'),sep='_',remove = 'false') +# +# +# +# # pick subsets -- one visit at a time in this version, but could be more +# bucket_list <- unique(occ[['Clinic_ymd']]) +# +# # get the size (number of buckets) +# N = length(bucket_list) +# print(N) +# +# # pre-allocate the data.table. Tables are supposed to be faster. +# ACHR = data.table(bucket=integer(N), +# Clinic_date = character(N), +# YMD_date = character(N), +# NEvents = integer(N), +# NumVisits = numeric(N), +# A_NetComplexity=double(N), +# AR_NetComplexity=double(N), +# ARW_NetComplexity=double(N), +# A_Nodes=double(N), +# AR_Nodes=double(N), +# ARW_Nodes=double(N), +# A_Edges=double(N), +# AR_Edges=double(N), +# ARW_Edges=double(N), +# A_CompressRatio = double(N), +# AR_CompressRatio = double(N), +# ARW_CompressRatio = double(N), +# A_Entropy = double(N), +# AR_Entropy = double(N), +# ARW_Entropy = double(N), +# Clinic = character(N), +# NumUniqueProcedures = numeric(N), +# NumUniqueDiagnosisGroups = numeric(N), +# NumPhysicians = numeric(N), +# Weekday = character(N), +# Month = character(N) +# ) +# +# # Now add columns for the IVs. There will be three for each IV +# +# # Add the IV columns +# for (cf in CFs){ +# +# ACHR[, paste0(cf,"_count"):= double(N)] +# ACHR[, paste0(cf,"_compression"):= double(N)] +# ACHR[, paste0(cf,"_entropy"):= double(N)] +# +# } +# +# # loop through the buckets. 
Result will be data frame with one row per bucket +# for (i in 1:N){ +# +# b = i # as.integer(bucket_list[i]) +# +# # print once every 10 buckets +# if (b%%10==0) {print(b)} +# +# # select the threads that go in this bucket +# df = occ[occ[[BU]] ==bucket_list[i],] +# +# # bucket number +# ACHR[b,bucket := b] +# +# # length of the thread (number of rows) +# ACHR[b,NEvents := nrow(df)] +# +# # only do the computations if there are more than two occurrences +# if (nrow(df) > 2) { +# +# # compute the duration of the visit in hours +# # ACHR[b,VisitDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours') ] +# +# # compressibility of DV +# ACHR[b,A_CompressRatio := compression_index(df,DV1)] +# ACHR[b,AR_CompressRatio := compression_index(df,DV2)] +# ACHR[b,ARW_CompressRatio := compression_index(df,DV3)] +# +# # NetComplexity of DV +# # First get the network +# n = threads_to_network_original(df,TN, DV1) +# ACHR[b,A_NetComplexity := estimate_network_complexity( n )] +# ACHR[b,A_Nodes := nrow(n$nodeDF) ] +# ACHR[b,A_Edges := nrow(n$edgeDF) ] +# +# n = threads_to_network_original(df,TN, DV2) +# ACHR[b,AR_NetComplexity := estimate_network_complexity( n )] +# ACHR[b,AR_Nodes := nrow(n$nodeDF) ] +# ACHR[b,AR_Edges := nrow(n$edgeDF) ] +# +# n = threads_to_network_original(df,TN, DV3) +# ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )] +# ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] +# ACHR[b,ARW_Edges := nrow(n$edgeDF) ] +# +# # get the entropy for AR and ARW +# ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] +# ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] +# +# +# # compute stuff on each context factor +# for (cf in CFs){ +# +# # Count the unique elements in each cf +# ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] +# +# # get the compression +# ACHR[b, paste0(cf,"_compression") := compression_index(df,cf) ] +# +# # get the entropy +# ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] +# +# } +# } # kf nrows > 2 +# +# # Now copy in the rest of data +# # this works because one visit is one bucket +# # count the number of different diagnoses --> typical indicators of complexity +# +# +# ACHR[b,'Clinic_date' := df[1,'Clinic_ymd']] +# ACHR[b,'YMD_date' := df[1,'ymd']] +# ACHR[b,'NumVisits' := length(unique(df[['Visit_ID']]))] +# ACHR[b,'Clinic' := df[1,'Clinic']] +# ACHR[b,'NumUniqueProcedures' := length(unique(df[['Proc']]))] +# ACHR[b,'NumUniqueDiagnosisGroups' := length(unique(df[['Diagnosis_Group']]))] +# ACHR[b,'NumPhysicians' := length(unique(df[['Physician']]))] +# ACHR[b,'Weekday' := df[1,'Weekday']] +# ACHR[b,'Month' := df[1,'Month']] +# +# +# +# } # loop thru buckets +# +# # copy this one for consistency +# ACHR[,'A_Entropy' := ACHR[,'Action_entropy'] ] +# +# # return the table +# return(ACHR) +# } # # make_box_plots <- function(){ @@ -516,7 +651,3 @@ ACHR_batch_clinic_days <- function(occ,TN, EVENT_CFs, ALL_CFs) { - - -get_timeScale <- function(){'hr'} - diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R index ded690e..a639574 100644 --- a/Data Prep/ACHR_Batch_PreProcess.R +++ b/Data Prep/ACHR_Batch_PreProcess.R @@ -145,6 +145,8 @@ thread_occurrences <- function(occ, THREAD_CF, fname='emr'){ # Assume the data are sorted by visit_ID and tStamp -- the natural flow of the visit. 
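   # (Toy check of the lag() trick below: for v <- c('a','a','b','b','c'),
   #  dplyr::lag(v) is c(NA,'a','a','b','b'), so which(v != dplyr::lag(v))
   #  returns c(3, 5) -- exactly the rows where a new thread begins.)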
idx_list = which(occ[[new_thread_col]] !=dplyr::lag(occ[[new_thread_col]])) idx_list = c(idx_list, nrow(occ)) + + # creates two lists of indices so they match up correctly. start_idx = c(1, head(idx_list,-1)) # add 1 to the front and drop the last end_idx = idx_list - 1 # shift back by one diff --git a/Data Prep/ACHR_Clinic_Trajectories_v3.R b/Data Prep/ACHR_Clinic_Trajectories_v3.R index 48525c5..5553868 100644 --- a/Data Prep/ACHR_Clinic_Trajectories_v3.R +++ b/Data Prep/ACHR_Clinic_Trajectories_v3.R @@ -22,15 +22,6 @@ library(philentropy) library(stringr) library(zoo) -# need function to count the number of procedures and number of diagnoses in the data. -# some visits have dozens of them -# need to pass in data frame that just has the column with the procedures or diagnoses -count_procs <- function(o){ - print(nrow(o)) - -unlist(lapply(1:nrow(o), function(i){ str_count(o[i], pattern='#@#') })) - -} # Use this to put the trajectory data with the clinic_day data merge_clinic_days_and_trajectories <- function(){ @@ -58,20 +49,23 @@ get_bucket <- function(o, b ){ # @param e data frame for POV -# @param blist is the bucket list. Each bucket is a "window" +# @param bucketCF is the column used to define the bucket list. Each bucket is a "window" # @param cf is the column that defines events (e.g. 'Action_Role') -# @param filter list contains the threshold levels for filtering out edges with low frequency: c(0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1) -graph_trajectory_filtered <- function(e, cf, reference_day=1, filter_list,save_file_name) { +# @param filter list contains the threshold level for filtering out edges with low frequency. +# +# need to make sure this works with roles over time... +# +graph_trajectory_filtered <- function(e, bucketCF, cf, n_gram_size=2, reference_day=1, filter_list=0,save_file_name) { # get total size of possible matrix Max_Order = length(unique(e[[cf]]))^2 - # make data frame for results + # make data frame for results - bid is the bucket_ID vt=data.frame( ngrams=character(), freq=integer(), bid=integer() ) # here is the list of buckets - blist = unique(e[['Clinic_ymd']]) + blist = unique(e[[bucketCF]]) # take the underscore out so we can use it to strsplit below... and sort it blist=gsub('HH_POB','HHPOB',blist) @@ -91,8 +85,10 @@ graph_trajectory_filtered <- function(e, cf, reference_day=1, filter_list,save_ # print(paste('b =',b)) # get text vector for the whole data set. Bucket needs at least 2 threads + # n_gram_size = 1 for 1-grams, n_gram_size = 2 for pairs. 
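+  # (count_ngrams() is ThreadNet's own helper; for intuition, the base ngram
+  #  package does the same kind of counting on a space-separated string:
+  #    library(ngram)
+  #    get.phrasetable(ngram("check_in vitals exam check_out", n = 2))
+  #  lists each consecutive pair of actions with its frequency.)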
th = get_bucket(e, b) - if (nrow(th)>2) { ngdf = count_ngrams(th, 'threadNum', cf, 2)[1:2] } + if (nrow(th)>2) { ngdf = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2] } + nodes = length(unique(th[[cf]])) @@ -139,43 +135,53 @@ graph_trajectory_filtered <- function(e, cf, reference_day=1, filter_list,save_ } - #Make a matrix to filter by where all the entries are 0 < f < 1 - fm = windowFreqMatrix/max(windowFreqMatrix) + # #Make a matrix to filter by where all the entries are 0 < f < 1 + # fm = windowFreqMatrix/max(windowFreqMatrix) + # + # # if the number if below the threshold, set it to zero + # windowFreqMatrix[fm<=f] = 0 + + df = data.frame( t(sapply( 1:(nWindows-1), + function(i){ + c( + bckt = blist[i], + Clinic = unlist(strsplit(blist[i],'_'))[1], + ymd = unlist(strsplit(blist[i],'_'))[2], + pct_retained = sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/sum(windowFreqMatrix[i,]>0), + pct_possible = sum(windowFreqMatrix[i,] > 0)/Max_Order, + complexity = estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ), + Dist_from_reference = distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), method='cosine' ), + Dist_from_next = distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), method='cosine' ) + ) + }))) - # Now filter out the edges - for (f in filter_list) { + # correlate each row with the first one stick it in a dataframe + # df =data.frame(window=1:(nWindows-1), + # Clinic_ymd = blist[1:(nWindows-1)], + # Clinic= unlist(lapply(1:(nWindows-1), + # function(i){ unlist(strsplit(blist[i],'_'))[1] })), + # ymd=unlist(lapply(1:(nWindows-1), + # function(i){ unlist(strsplit(blist[i],'_'))[2] })), + # pct_retained = unlist(lapply(1:(nWindows-1), + # function(i){sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/ + # sum(windowFreqMatrix[i,]>0) + # })) , + # pct_possible = unlist(lapply(1:(nWindows-1), + # function(i){sum(windowFreqMatrix[i,] > 0)/Max_Order })) , + # complexity = unlist(lapply(1:(nWindows-1), + # function(i){ estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ) })) , + # Dist_from_reference= unlist(lapply(1:(nWindows-1), + # function(i){distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), + # method='cosine' ) })), + # Dist_from_next= unlist(lapply(1:(nWindows-1), + # function(i){distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), + # method='cosine' ) })) + # ) + + save(df, file=paste0(save_file_name,'.rData') ) - # if the number if below the threshold, set it to zero - windowFreqMatrix[fm<=f] = 0 - # correlate each row with the first one stick it in a dataframe - df =data.frame(window=1:(nWindows-1), - Clinic_ymd = blist[1:(nWindows-1)], - Clinic= unlist(lapply(1:(nWindows-1), - function(i){ unlist(strsplit(blist[i],'_'))[1] })), - ymd=unlist(lapply(1:(nWindows-1), - function(i){ unlist(strsplit(blist[i],'_'))[2] })), - pct_retained = unlist(lapply(1:(nWindows-1), - function(i){sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/ - sum(windowFreqMatrix[i,]>0) - })) , - pct_possible = unlist(lapply(1:(nWindows-1), - function(i){sum(windowFreqMatrix[i,] > 0)/Max_Order })) , - complexity = unlist(lapply(1:(nWindows-1), - function(i){ estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ) })) , - Dist_from_reference= unlist(lapply(1:(nWindows-1), - function(i){distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), - method='cosine' ) })), - Dist_from_next= unlist(lapply(1:(nWindows-1), - 
function(i){distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]),
-                                            method='cosine' ) }))
-  )
-
-  save(df, file=paste0(save_file_name,f,'.rData') )
-
-  } # f in filter list
-
   return(df)
 
 
 # get the ngram data and labels
diff --git a/Tables and Figures/Clinic_day_plots.R b/Tables and Figures/Clinic_day_plots.R
index e0fb1ac..a6e05a5 100644
--- a/Tables and Figures/Clinic_day_plots.R
+++ b/Tables and Figures/Clinic_day_plots.R
@@ -34,8 +34,11 @@ plots_for_papers <- function(){
   plot(cds$Clinic.x,cds$A_NetComplexity)
 
   # Clinic change over time, with and without smoothing
-  ggplot(data = cds, aes(x = ymd, y = corr_with_first, group=Clinic)) + geom_line(aes(color=Clinic))
-  ggplot(data = cds, aes(x = ymd, y = rollmean(corr_with_first,5,na.pad=TRUE), group=Clinic)) + geom_line(aes(color=Clinic))
+  ggplot(data = cds, aes(x = ymd, y = Dist_from_reference, group=Clinic)) + geom_line(aes(color=Clinic))
+  ggplot(data = cds, aes(x = ymd, y = rollmean(Dist_from_reference,5,na.pad=TRUE), group=Clinic)) + geom_line(aes(color=Clinic))
+
+  # Role change over time
+  ggplot(data = cds, aes(x = ymd, y = Dist_from_reference, group=Role)) + geom_line(aes(color=Role))
 
 }
 
From 71c25050dba090b8963ed097572c30c642528585 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Sun, 25 Nov 2018 13:26:34 -0500
Subject: [PATCH 14/31] making some small changes and comments

---
 Data Prep/ACHR_Clinic_Trajectories_v3.R | 33 ++++++-------------------
 1 file changed, 8 insertions(+), 25 deletions(-)

diff --git a/Data Prep/ACHR_Clinic_Trajectories_v3.R b/Data Prep/ACHR_Clinic_Trajectories_v3.R
index 5553868..e203b73 100644
--- a/Data Prep/ACHR_Clinic_Trajectories_v3.R
+++ b/Data Prep/ACHR_Clinic_Trajectories_v3.R
@@ -8,6 +8,11 @@
 
 # these functions will take clinic data and produce trajectories (distance of the graph from a reference graph)
 
+# need to rethink how this selects occurrences in buckets.
+# Should buckets always be by clinic_day?? This allows us to track what happens in each clinic over time.
+
+# We want to track roles over time in each clinic. So it's Clinic_day and also visit_role. We have threads by visit_role
+
 # start with auditfinal_10022018.csv --> use ACHR_Batch_v2 to make new_occ.
 # The code here starts with new_occ (~ 7.72m occurrences that we break down into 1857 clinic_days and 57835 visits)
@@ -84,7 +89,8 @@ graph_trajectory_filtered <- function(e, bucketCF, cf, n_gram_size=2, reference
   bcount= bcount +1
   # print(paste('b =',b))
 
-  # get text vector for the whole data set. Bucket needs at least 2 threads
+  # THIS CHECK NEEDS TO COVER WHOLE CHUNK OF CODE...
+  # get text vector for the whole data set. Bucket needs at least 2 threads
   # n_gram_size = 1 for 1-grams, n_gram_size = 2 for pairs.
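   # (One way to make the check cover the whole chunk, per the note above --
   #  a sketch only, where everything that consumes ngdf moves inside the branch:
   #    th = get_bucket(e, b)
   #    if (nrow(th) > 2) {
   #      ngdf  = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2]
   #      nodes = length(unique(th[[cf]]))
   #    }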
th = get_bucket(e, b) if (nrow(th)>2) { ngdf = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2] } @@ -155,33 +161,10 @@ graph_trajectory_filtered <- function(e, bucketCF, cf, n_gram_size=2, reference ) }))) - # correlate each row with the first one stick it in a dataframe - # df =data.frame(window=1:(nWindows-1), - # Clinic_ymd = blist[1:(nWindows-1)], - # Clinic= unlist(lapply(1:(nWindows-1), - # function(i){ unlist(strsplit(blist[i],'_'))[1] })), - # ymd=unlist(lapply(1:(nWindows-1), - # function(i){ unlist(strsplit(blist[i],'_'))[2] })), - # pct_retained = unlist(lapply(1:(nWindows-1), - # function(i){sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/ - # sum(windowFreqMatrix[i,]>0) - # })) , - # pct_possible = unlist(lapply(1:(nWindows-1), - # function(i){sum(windowFreqMatrix[i,] > 0)/Max_Order })) , - # complexity = unlist(lapply(1:(nWindows-1), - # function(i){ estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ) })) , - # Dist_from_reference= unlist(lapply(1:(nWindows-1), - # function(i){distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), - # method='cosine' ) })), - # Dist_from_next= unlist(lapply(1:(nWindows-1), - # function(i){distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), - # method='cosine' ) })) - # ) + ##### maybe use merge to get other variables included? Or do that outside the function... save(df, file=paste0(save_file_name,'.rData') ) - - return(df) # get the ngram data and labels From 250423c29658d4ac44cf62ac479011cef536f502 Mon Sep 17 00:00:00 2001 From: Brian Pentland Date: Sun, 25 Nov 2018 13:27:02 -0500 Subject: [PATCH 15/31] updated ignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 7cc7327..998975a 100644 --- a/.gitignore +++ b/.gitignore @@ -33,3 +33,4 @@ Threads+threadNum+Action_Role.Rdata emrt+Visit_ID+Role_Action_Workstation.Rdata emr+Visit_ID.Rdata Thrds+threadNum+Action_Role_Workstation.Rdata +Nov 25 am workspace.RData From be2c552838c0771409e4726b07839b006a58303a Mon Sep 17 00:00:00 2001 From: Brian Pentland Date: Sat, 1 Dec 2018 00:51:28 -0500 Subject: [PATCH 16/31] lots of little changes... not done yet. 
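
One of the little changes is switching the bucket subsets to keyed
data.table lookups. A toy sketch of that idiom (setkeyv() indexes the table
once, and get(TN) lets the thread column name live in a variable):

    library(data.table)
    occ <- data.table(Visit_ID = c('a','a','b'), Action = c('x','y','z'))
    TN  <- 'Visit_ID'
    setkeyv(occ, TN)       # sort and index by the thread column once
    occ[get(TN) == 'a']    # same rows as occ[Visit_ID == 'a'], found fast
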
---
 Data Prep/ACHR_Batch_Aggregate_Data.R   | 190 +++++++++++++++---------
 Data Prep/ACHR_Batch_PreProcess.R       |  61 ++++----
 Data Prep/ACHR_Clinic_Trajectories_v3.R |  66 ++++----
 Data Prep/Comparison_functions.R        |  84 -----------
 Stats/precedence_functions.R            |  38 +++++
 5 files changed, 221 insertions(+), 218 deletions(-)
 create mode 100644 Stats/precedence_functions.R

diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R
index bad8ae5..cfc7ea5 100644
--- a/Data Prep/ACHR_Batch_Aggregate_Data.R
+++ b/Data Prep/ACHR_Batch_Aggregate_Data.R
@@ -15,29 +15,19 @@
 # 2) By Clinic_day

-
-#############################################################################################
-# Functions to make buckets
-# Each bucket is a list of thread numbers that can be used to subset the list of occurrences
-# Need to be careful how you call this because it can aggregate threads in unexpected ways
-# For example, if you want Visit_ID_Role, use criteria = threadNum on the appropriately threaded input
-make_buckets_1 <- function(o, criteria){
-
- return( unique(o[[criteria]]) )
-
-}
-
-
 ###################################################################
 ###################################################################
 # This function is set up to aggregate the occurrences within threads -- typically visits
 # occ = pre-processed threaded occurrences
 # TN = threadNum in most cases
-# CFs can be chosen -- they don't have to match the POV
+# CFs can be chosen
+
+# event_CFs define changes within threads
+# ALL_CFs are used to compute the CF_alignment
+
 # need to bring Role_ID forward when looking at Visit_Role, so we can track residents over time.

-ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) {
+ACHR_batch_threads <- function(occ,THREAD_CFs, EVENT_CFs, ALL_CFs) {

 library(tidyr)
 library(dplyr)
@@ -48,28 +38,37 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) {
 library(data.table)

 # Add columns for combinations of CFs if needed
+ thread_col = newColName(THREAD_CFs)
+ TN = thread_col
+ if (!(thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CFs,thread_col) }
+
+
 new_event_col = newColName(EVENT_CFs)
 if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) }

 all_cf_col = newColName(ALL_CFs)
 if (!(all_cf_col %in% colnames(occ))) { occ = combineContextFactors(occ,ALL_CFs,all_cf_col) }

+ # set key on the data.table
+ setkeyv(occ, TN)
+
 # get the list of buckets
- bucket_list <- make_buckets_1(occ, TN)
+ bucket_list <- unique(occ[[TN]])

 # print the number of buckets
 print(paste0('Number of buckets=', length(bucket_list) ))

 # make data frame with results
- Thrds = data.frame( t(sapply( bucket_list,
+ Thrds = data.table( t(sapply( bucket_list,
 function(b){

 # select a subset
- df = occ[ occ[[TN]] ==b , ]
+ df= occ[get(TN)==b]
+
+ # make sure it is sorted by timestamp
+ df=df[order(df$tStamp),]

- # Only run for visits with more than two occurrences
- # get the network -- only if there are enough rows...
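
# Sketch of the setkeyv()/get() pattern above, on a toy table: setkeyv() sorts and
# indexes the table by the thread column, so each per-bucket subset is a keyed lookup
# rather than a full scan, and get(TN) resolves the column name held in the variable TN.
library(data.table)
occ_toy <- data.table(threadNum = c(1, 1, 2, 2, 2), Action = c('a', 'b', 'a', 'c', 'b'))
setkeyv(occ_toy, 'threadNum')
TN <- 'threadNum'
occ_toy[get(TN) == 2]   # the three rows of thread 2
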
if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) @@ -78,24 +77,25 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { # compute each parameter and put them in a vector c( bucket=b, - Clinic = df[1,'Clinic'], - Physician = as.character(df[1,'Physician']), - ymd = df[1,'ymd'], - Clinic_ymd = df[1,'Clinic_ymd'], - Weekday = df[1,'Weekday'], - Month = df[1,'Month'], + Clinic = as.character(df$Clinic[1]), + Physician = as.character(df$Physician[1]), + ymd = as.character(df$ymd[1]), + Clinic_ymd = as.character(df$Clinic_ymd[1]), + Weekday = as.character(df$Weekday[1]), + Month = as.character(df$Month[1]), Phase = compute_phase(df$tStamp[1]), - threadNum = df[1,'threadNum'], - Visit_ID = df[1,'Visit_ID'], - Subject_ID = df[1,'Subject_ID'], - Role_ID = as.character( df[1,'Role_ID'] ), + threadNum = as.numeric(df$threadNum[1]), + Visit_ID = as.character(df$Visit_ID[1]), + Subject_ID = as.character( df$Subject_ID[1] ), + Role_ID = as.character( df$Role_ID[1] ), NEvents = nrow(df), - ThreadStart= as.character( df[1,'tStamp'] ), - ThreadStartInt = df[1,'tStamp'], - ThreadDuration= difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours' ), - wait_time = compute_wait_time(df), - # Visit_number = integer(N), - # LOC_CPT = character(N), + ThreadStart= as.character(df$tStamp[1] ), + ThreadDuration= compute_thread_duration(df), + VisitDuration= compute_visit_duration(df), + wait_time1 = compute_wait_time1(df), + wait_time2 = compute_wait_time2(df), + Visit_number = as.numeric(df[1,'Visitnum_Total']), + LOC_CPT = as.character(df[1,'LOS_CPT']), NetComplexity=estimate_network_complexity( n ), Nodes=nrow(n$nodeDF), Edges=nrow(n$edgeDF), @@ -105,7 +105,7 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { NumDiagnoses = count_diagnoses(df$Diag[1]), Proc = as.character(df[1,'Proc']), Diagnosis = as.character(df[1,'Diag']), - Diagnosis_group = df[1,'Diagnosis_Group'], + Diagnosis_group = as.character(df[1,'Diagnosis_Group']), CF_Alignment = 1, # make placeholder, but compute below ALL_CF_count = length(unique(df[[all_cf_col]])), ALL_CF_entropy = compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]), @@ -136,7 +136,10 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { # Compute the alignment of the context factors Thrds$CF_Alignment = as.numeric( as.character(Thrds$Action_count)) / as.numeric( as.character(Thrds$ALL_CF_count )) - save(Thrds, file=paste0(paste('Thrds',TN,new_event_col,sep='+'), '.Rdata')) + save_file_name = paste0(paste('Thrds',TN,new_event_col,sep='+'), '.Rdata') + save(Thrds, file=save_file_name) + + print(paste('Saved ', nrow(Thrds), " records in ",save_file_name)) return(Thrds) @@ -162,34 +165,88 @@ compute_phase <- function(t){ # else return NA return(NA) +} +compute_thread_duration <- function(df){ + + return(difftime(lubridate::ymd_hms(df$tStamp[nrow(df)]), lubridate::ymd_hms(df$tStamp[1]), units='hours' )) + } -# alignment is a measure of how much the number of nodes is increased by adding context factors -# always base this on the "Action" -- how many more "actions" are there? 
-# could also do this based on entropy, or write the function to use the values that were already computed
-# compute_alignment <- function(df,TN, EVENT_CFs, ALL_CFs ){
-#
-# # first get numerator
-# n= length(unique(df$Action))
-#
-# # Now get the total number with all of the CFs
-# m= length(unique( df[[newColName(EVENT_CFs)]]))
-#
-# return(n/m)
-# }
-
+compute_visit_duration <- function(df){
+
+ # get the checkin time. Note that they sometimes look at patient data in advance of the visit, so you
+ # cannot use the first occurrences to mark the start of the thread
+ w1= grep('CHECKIN_TIME',df$Action)[1]
+ w2= grep('AVS_PRINT_TIME',df$Action)[1]
+
+ return(difftime(lubridate::ymd_hms(df$tStamp[w2]), lubridate::ymd_hms(df$tStamp[w1]), units='hours' ))
+
+}

-# NEEDS WORK!! Far too simplistic.
-compute_wait_time <- function(df){
+# First wait time... from checkin until they get their vitals taken
+compute_wait_time1 <- function(df){

- # find the first occurrence of the chief complaint. Seems to correspond closely to wait time
- # Make sure it's not in the first 5 occurrences, and make sure to return a value of at least 1 if it never occurs
- w= grep('MR_VN_CHIEF_COMPLAINT',df$Action)[1]
+ # find the first occurrence of MR_VN_VITALS. Corresponds closely to wait time
+ # w= grep('MR_VN_VITALS',df$Action)[1]
+ w= grep('VITALS',df$Action)[1]

 return( difftime( lubridate::ymd_hms(df$tStamp[w]),lubridate::ymd_hms(df$tStamp[1]), units='hours' ) )

 }

+# Second wait time... from vitals until they see a doctor or other medical person
+compute_wait_time2 <- function(df){
+
+ # find the first occurrence of MR_VN_VITALS. Get the time t1
+ # w1= grep('MR_VN_VITALS',df$Action)[1]
+ w1= grep('VITALS',df$Action)[1]
+
+ if (is.na(w1))
+ return(NA)
+ else
+ {
+ t1 = lubridate::ymd_hms(df$tStamp[w1])
+
+
+ # get the workstation and role used for the vitals at t1
+ work1 = df$Workstation[w1]
+ role1 = df$Role[w1]
+
+ # get set of actions at the same workstation or different workstations that happen later in the visit
+ # Truncate the visit up to w1
+ restdf = df[w1:nrow(df),]
+
+ # then get the set of actions at the same workstation or different workstations (in that visit)
+ sameWS = restdf[restdf$Workstation == work1,]
+ differentWS = restdf[restdf$Workstation != work1,]
+
+ # get list of other roles
+ otherRoles = paste(setdiff(unique(restdf$Role),c('Technician','Technologist','Staff','Unknown')),
+ collapse = '|')
+
+ # Now find the marker for when they see a physician
+ # procedure is different in BRKPT and HHPOB clinics
+ if (df$Clinic[1] %in% c('BRKPT','HHPOB') )
+ { # LPN brings a different workstation on wheels at these clinics
+ firstLPN = grep('LPN',differentWS$Role)[1]
+ t2 = lubridate::ymd_hms(differentWS$tStamp[firstLPN])
+
+ # Other roles use the same workstation
+ firstDoc = grep('Physician|Resident|Registered', sameWS$Role)[1]
+ t2 = lubridate::ymd_hms(sameWS$tStamp[firstDoc])
+ }
+ else
+ { # a different role, using the same workstation
+ w2 = max(w1, grep(otherRoles, sameWS$Role) )[1]
+ t2 = lubridate::ymd_hms(sameWS$tStamp[w2])
+ }
+
+ # return the difference in time from t1 to t2
+ return( difftime( t2, t1, units='hours' ) )
+ }
+}
+
+
 # THIS ONE IS GOOD.
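
# Toy walk-through of the wait-time logic above (rows are hypothetical; real Action
# codes come from the EMR audit log): the second wait runs from the vitals event to
# the first subsequent event by a different role.
library(lubridate)
df_toy <- data.frame(
  tStamp = c('2017-01-05 08:00:00', '2017-01-05 08:20:00', '2017-01-05 08:45:00'),
  Action = c('CHECKIN_TIME', 'MR_VN_VITALS', 'AC_VISIT_NAVIGATOR'),
  Role   = c('Staff', 'LPN', 'Physician'), stringsAsFactors = FALSE)
w1 <- grep('VITALS', df_toy$Action)[1]      # index of the vitals event
t1 <- ymd_hms(df_toy$tStamp[w1])
t2 <- ymd_hms(df_toy$tStamp[3])             # first action by a role other than the LPN
difftime(t2, t1, units = 'hours')           # 0.4167 hours, i.e. a 25-minute wait
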
 count_procedures <- function( p ){

@@ -228,10 +285,7 @@ count_diagnoses <- function(d){
 # This function is set up to aggregate the occurrences among collections of visits -- typically clinic_days
 # occ = pre-processed threaded occurrences
 # TN = threadNum in most cases
-# CFs can be chosen -- they don't have to match the POV
-
-# need to bring Role_ID forward when looking at Visit_Role, so we can track residents over time.
-
+# CFs can be chosen

 ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) {

 library(tidyr)
@@ -242,7 +296,7 @@ ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) {
 library(stringr)
 library(data.table)

- # make a list of unique buckets. Typical example
+ # make a list of unique buckets.
 bucket_list <- unique(occ[[TN]])

 # print the number of buckets
@@ -257,7 +311,7 @@ ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) {

 # make data frame with results
- cds = data.frame( t(sapply( bucket_list,
+ cds = data.table( t(sapply( bucket_list,
 function(b){

 # select a subset of occurrences for the bucket
@@ -641,13 +695,3 @@ ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) {
 # return(ACHR)
 # }

-#
-# make_box_plots <- function(){
-# ggboxplot(ACHR_test[NEvents>100 & Clinic=='DRH'], x = "VisitMonth", y = "NetComplexity",
-# color = "VisitDay",
-# ylab = "Complexity", xlab = "Month (DRH)")
-# }
-#
-
-
-
diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R
index a639574..27db4af 100644
--- a/Data Prep/ACHR_Batch_PreProcess.R
+++ b/Data Prep/ACHR_Batch_PreProcess.R
@@ -13,19 +13,28 @@
 # 2) Threading the data by adding ThreadNum and SeqNum to the threads #
 ##########################################################################################################

-
+library(tidyr)
+library(data.table)
+library(dplyr)
+library(ThreadNet)
+library(ngram)
+library(lubridate)

 redo_ACHR_data_from_scratch <- function(fname){

+ fname = 'audit_111818'
+
 # Pick your point of view, or multiple POV...
- THREAD_CF = c('VISIT_ID')
+ THREAD_CF = c('Visit_ID')
 EVENT_CF = c('Action','Role','Workstation')
+ ALL_CF = c('Action','Role','Workstation')

 # first read the data
 o = read_ACHR_data( fname )

- # Thread occurrences for each POV
- occ = thread_occurrences( o, THREAD_CF, EVENT_CF ,fname )
+ # thread_occurrences adds threadNum and seqNum to each thread, as defined by the thread_CF.
+ # Threads are always WITHIN VISITS in this code -- so they can be whole visits or chunks of visits (e.g., visit_ID_Role)
+ occ = thread_occurrences( o, THREAD_CF ,fname )

 # Now aggregate by clinic-day for each POV
 clinic_days = ACHR_batch_clinic_days(occ, THREAD_CF, EVENT_CF, fname)
@@ -33,6 +42,9 @@ redo_ACHR_data_from_scratch <- function(fname){
 # Aggregate by visit for each POV
 visits = ACHR_batch_visits(occ, THREAD_CF, EVENT_CF, fname)

+ # compute trajectories to see difference from a reference graph
+ traj = graph_trajectory( occ, 'Clinic_ymd', EVENT_CF, 2, 1, 0, 'Clinic_trajectory')
+
 }

 # this function reads the raw data from URMC
 #
 # Tested Nov 16.
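
# Hedged illustration of the POV idea: combineContextFactors() (a ThreadNet function)
# builds the composite event codes used above. Its output is presumably the
# underscore-joined form shown below, since combined columns are referenced elsewhere
# in this repo by names like Action_Role_Workstation.
ev <- data.frame(Action = c('MR_VN_VITALS', 'AC_VISIT_NAVIGATOR'),
                 Role = c('LPN', 'Physician'),
                 Workstation = c('WS01', 'WS02'))
paste(ev$Action, ev$Role, ev$Workstation, sep = '_')
# "MR_VN_VITALS_LPN_WS01" "AC_VISIT_NAVIGATOR_Physician_WS02"
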
read_ACHR_data <- function(fname){ - library(tidyr) - library(data.table) - library(dplyr) - library(ThreadNet) - library(ngram) - library(lubridate) - library(anytime) + # This code is tailored for reading in new data from URMC, October 2018 # Assumes there is a column called 'Timestamps" in column 13 and "Visit_ID" - # read the file into data frame + # read the file into data.table d <- fread( paste0(fname, '.csv') ) - # Sort by visit and timestamps + # Sort by visit and timestamps (will do this later with setkey) d <- arrange(d,desc(Visit_ID,asc(Timestamps))) - - ################################################################################## + # This stuff is URMC specific # # move and rename the columns, as needed @@ -68,9 +73,13 @@ read_ACHR_data <- function(fname){ setnames(d,'Timestamps','tStamp') setnames(d,'V1','seqn') - # This converts numbers to char and replaces spaces with underscore - d = cleanOccBatch(d) + # Fix the name of the Highland clinic + d$Clinic = gsub('HH POB','HHPOB',d$Clinic) + + # This converts numbers to char and replaces spaces with underscore + d = cleanOccBatch(d) + ### This is URMC specific: Add the clinic and clinic_day and fix the code for the highland clinic # get the date only from the timestamp @@ -79,11 +88,11 @@ read_ACHR_data <- function(fname){ # make new columns for clinic + day and events d = unite(d, 'Clinic_ymd', c('Clinic','ymd'),sep='_',remove = 'false') - # Fix the name of the Highland clinic - d$Clinic = gsub('HH_POB','HHPOB',d$Clinic) # Save the result - save(d, file=paste0(fname, '.rData')) + save_file_name = paste0(fname, '.rData') + save(d, file=save_file_name) + print(paste("Saved ",nrow(d), " occurrences in ",save_file_name)) return(d) @@ -101,7 +110,7 @@ cleanOccBatch <- function(fileRows){ cleanedCF <- data.frame(lapply(fileRows[2:ncol(fileRows)], function(x){ gsub(" ","_",x) }) ) # bind tStamp back to cleaned data - complete <- cbind(tStamp,cleanedCF) + complete <- as.data.table(cbind(tStamp,cleanedCF)) # Old code for forcing tStamp into a "YMD_HMS" format -- not needed for URMC EMR data # complete$tStamp <- as.character(complete$tStamp) @@ -155,16 +164,18 @@ thread_occurrences <- function(occ, THREAD_CF, fname='emr'){ # sapply(seq(length(idx_list)), function(x) for (x in seq(length(idx_list))) { if (x %% 1000 ==0) print(paste0('Thread count = ',x)) - occ[start_idx[x]:end_idx[x],'threadNum'] = x - occ[start_idx[x]:end_idx[x],'seqNum'] = c(1:(end_idx[x]-start_idx[x]+1)) + occ[start_idx[x]:end_idx[x],'threadNum' := x ] + occ[start_idx[x]:end_idx[x],'seqNum':= c(1:(end_idx[x]-start_idx[x]+1)) ] } # Now save the result for later and return it, as well. 
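
# Small sketch of what cleanOccBatch()'s gsub does, and why the clinic rename above
# now matches 'HH POB' with a blank: the rename runs before the space-to-underscore
# cleaning, so at that point the raw value still contains a space.
x <- c('HH POB', 'Registered Nurse')
gsub(' ', '_', x)   # "HH_POB" "Registered_Nurse" -- every value becomes a single token
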
 # save(occ, file=paste0(paste(fname,new_thread_col,new_event_col,sep='+'), '.Rdata'))
- save(occ, file=paste0(paste(fname,new_thread_col,sep='+'), '.Rdata'))
- print(paste('Saved threaded occurrences: ', nrow(occ)))
+ save_file_name = paste0(paste0(paste(fname,new_thread_col,sep='+'), '.rData'))
+ save(occ, file=save_file_name)
+ print(paste('Saved threaded occurrences: ', nrow(occ), " in ",save_file_name))
+
 return(occ)

 }
diff --git a/Data Prep/ACHR_Clinic_Trajectories_v3.R b/Data Prep/ACHR_Clinic_Trajectories_v3.R
index e203b73..5093f1e 100644
--- a/Data Prep/ACHR_Clinic_Trajectories_v3.R
+++ b/Data Prep/ACHR_Clinic_Trajectories_v3.R
@@ -31,7 +31,7 @@ library(zoo)

 # Use this to put the trajectory data with the clinic_day data
 merge_clinic_days_and_trajectories <- function(){
- clinic_days$Clinic_date = gsub('HH_POB','HHPOB',clinic_days$Clinic_date)
+ # clinic_days$Clinic_date = gsub('HH_POB','HHPOB',clinic_days$Clinic_date)
 cds=merge(x=test0,y=clinic_days,by.x='Clinic_ymd',by.y ='Clinic_date' ,all=TRUE)
 }
@@ -54,34 +54,28 @@ get_bucket <- function(o, b ){

 # @param e data frame for POV
-# @param bucketCF is the column used to define the bucket list. Each bucket is a "window"
+# @param bucket_CFs is the column used to define the bucket list, or the combination of columns needed to create the bucket. Each bucket is a "window"
 # @param cf is the column that defines events (e.g. 'Action_Role')
 # @param filter list contains the threshold level for filtering out edges with low frequency.
 #
-# need to make sure this works with roles over time...
+# MAKE SURE YOU ARE READING IN A SET OF OCCURRENCES THAT HAS THE CORRECT THREADS...
 #
-graph_trajectory_filtered <- function(e, bucketCF, cf, n_gram_size=2, reference_day=1, filter_list=0,save_file_name) {
+graph_trajectory <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1, filter_threshold=0,keep_ngram_vectors=FALSE, save_file_name='deleteme') {

- # get total size of possible matrix
- Max_Order = length(unique(e[[cf]]))^2
-
-
- # make data frame for results - bid is the bucket_ID
- vt=data.frame( ngrams=character(), freq=integer(), bid=integer() )
-
- # here is the list of buckets
- blist = unique(e[[bucketCF]])
-
- # take the underscore out so we can use it to strsplit below... and sort it
- blist=gsub('HH_POB','HHPOB',blist)
- blist = sort(blist)
+ # add column for bucket_CF if needed
+ bucket_col = newColName(bucket_CFs)
+ if (!(bucket_col %in% colnames(e))) { e = combineContextFactors(e,bucket_CFs,bucket_col) }

+ # here is the sorted list of buckets
+ blist = sort(unique(e[[bucket_col]]))

+ # how many buckets?
 nb = length(blist)
- print(blist)
- print(paste('nb =',nb))
+ # print(blist)
+ print(paste('Number of buckets =',nb))

- complexity_idx=list(nb)
+ # make data frame for results
+ vt=data.frame( ngrams=character(), freq=integer(), bid=integer(),bname=character() )

 bcount=0
 # scan through the data
@@ -92,8 +86,8 @@ graph_trajectory_filtered <- function(e, bucketCF, cf, n_gram_size=2, reference

 # THIS CHECK NEEDS TO COVER WHOLE CHUNK OF CODE...
 # get text vector for the whole data set. Bucket needs at least 2 threads
 # n_gram_size = 1 for 1-grams, n_gram_size = 2 for pairs.
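
# Hedged picture of the 2-grams that count_ngrams() tallies per bucket (count_ngrams
# itself comes from ThreadNet; this just shows the bigram vocabulary for one toy thread):
thread <- c('CHECKIN', 'VITALS', 'EXAM', 'CHECKOUT')
paste(head(thread, -1), tail(thread, -1))
# "CHECKIN VITALS" "VITALS EXAM" "EXAM CHECKOUT" -- three transitions from four events
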
- th = get_bucket(e, b) - if (nrow(th)>2) { ngdf = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2] } + th = e[ e[[bucket_col]] ==b , ] + if (nrow(th)>3) { ngdf = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2] } nodes = length(unique(th[[cf]])) @@ -103,13 +97,12 @@ graph_trajectory_filtered <- function(e, bucketCF, cf, n_gram_size=2, reference # add the bucket number and name ngdf$bid = bcount - # ngdf$bname = b + ngdf$bname = b # append the columns to the end # vt is the whole set of all ngrams in all the windows vt=rbind(vt,ngdf) } - print(head(complexity_idx)) # convert to factor @@ -145,7 +138,7 @@ graph_trajectory_filtered <- function(e, bucketCF, cf, n_gram_size=2, reference # fm = windowFreqMatrix/max(windowFreqMatrix) # # # if the number if below the threshold, set it to zero - # windowFreqMatrix[fm<=f] = 0 + # windowFreqMatrix[fm<=filter_threshold] = 0 df = data.frame( t(sapply( 1:(nWindows-1), function(i){ @@ -154,25 +147,26 @@ graph_trajectory_filtered <- function(e, bucketCF, cf, n_gram_size=2, reference Clinic = unlist(strsplit(blist[i],'_'))[1], ymd = unlist(strsplit(blist[i],'_'))[2], pct_retained = sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/sum(windowFreqMatrix[i,]>0), - pct_possible = sum(windowFreqMatrix[i,] > 0)/Max_Order, complexity = estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ), Dist_from_reference = distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), method='cosine' ), Dist_from_next = distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), method='cosine' ) ) }))) - ##### maybe use merge to get other variables included? Or do that outside the function... - + + # get the ngram data and labels + if (keep_ngram_vectors) { + b_df=as.data.frame(windowFreqMatrix[1:(nWindows-1),]) + colnames(b_df)=vt_unique$ngrams + + # stick the ngram frequencies on the end for good measure + df = cbind(df,b_df) + } + + # save the result and return it, too save(df, file=paste0(save_file_name,'.rData') ) - return(df) - - # get the ngram data and labels - # b_df=as.data.frame(windowFreqMatrix[1:(nWindows-1),]) - # colnames(b_df)=vt_unique$ngrams - # - # # stick the ngram frequencies on the end for good measure - # return(cbind(df,b_df)) + return(df) } diff --git a/Data Prep/Comparison_functions.R b/Data Prep/Comparison_functions.R index ba6c6d0..7d94256 100644 --- a/Data Prep/Comparison_functions.R +++ b/Data Prep/Comparison_functions.R @@ -9,91 +9,7 @@ library(dplyr) -get_date_subsets <-function(c) { -b1 <<- new_occ_VR %>% filter(Clinic==eval(c) & as.Date(tStamp) > '2016-01-01' & as.Date(tStamp) < '2016-06-06') -b2 <<- new_occ_VR %>% filter(Clinic==eval(c) & as.Date(tStamp) > '2016-06-8' & as.Date(tStamp) < '2016-09-01') -b3 <<- new_occ_VR %>% filter(Clinic==eval(c) & as.Date(tStamp) > '2016-09-02' & as.Date(tStamp) < '2017-04-15') -b4 <<- new_occ_VR %>% filter(Clinic==eval(c) & as.Date(tStamp) > '2017-04-15' & as.Date(tStamp) < '2017-09-01') -b5 <<- new_occ_VR %>% filter(Clinic==eval(c) & as.Date(tStamp) > '2017-09-02' & as.Date(tStamp) < '2017-12-31') -rc1 <<- new_occ_VR %>% filter(Clinic=='REDCK' & as.Date(tStamp) > '2016-01-01' & as.Date(tStamp) < '2016-06-06') -rc2 <<- new_occ_VR %>% filter(Clinic=='REDCK' & as.Date(tStamp) > '2016-06-8' & as.Date(tStamp) < '2016-09-01') -rc3 <<- new_occ_VR %>% filter(Clinic=='REDCK' & as.Date(tStamp) > '2016-09-02' & as.Date(tStamp) < '2017-04-15') -rc4 <<- new_occ_VR %>% filter(Clinic=='REDCK' & as.Date(tStamp) > '2017-04-15' & as.Date(tStamp) < '2017-09-01') -rc5 <<- 
new_occ_VR %>% filter(Clinic=='REDCK' & as.Date(tStamp) > '2017-09-02' & as.Date(tStamp) < '2017-12-31') - -smh1 <<- new_occ_VR %>% filter(Clinic=='SMH' & as.Date(tStamp) > '2016-01-01' & as.Date(tStamp) < '2016-06-06') -smh2 <<- new_occ_VR %>% filter(Clinic=='SMH' & as.Date(tStamp) > '2016-06-8' & as.Date(tStamp) < '2016-09-01') -smh3 <<- new_occ_VR %>% filter(Clinic=='SMH' & as.Date(tStamp) > '2016-09-02' & as.Date(tStamp) < '2017-04-15') -smh4 <<- new_occ_VR %>% filter(Clinic=='SMH' & as.Date(tStamp) > '2017-04-15' & as.Date(tStamp) < '2017-09-01') -smh5 <<- new_occ_VR %>% filter(Clinic=='SMH' & as.Date(tStamp) > '2017-09-02' & as.Date(tStamp) < '2017-12-31') - -hh1 <<- new_occ_VR %>% filter(Clinic=='HHPOB' & as.Date(tStamp) > '2016-01-01' & as.Date(tStamp) < '2016-06-06') -hh2 <<- new_occ_VR %>% filter(Clinic=='HHPOB' & as.Date(tStamp) > '2016-06-8' & as.Date(tStamp) < '2016-09-01') -hh3 <<- new_occ_VR %>% filter(Clinic=='HHPOB' & as.Date(tStamp) > '2016-09-02' & as.Date(tStamp) < '2017-04-15') -hh4 <<- new_occ_VR %>% filter(Clinic=='HHPOB' & as.Date(tStamp) > '2017-04-15' & as.Date(tStamp) < '2017-09-01') -hh5 <<- new_occ_VR %>% filter(Clinic=='HHPOB' & as.Date(tStamp) > '2017-09-02' & as.Date(tStamp) < '2017-12-31') - -} - -# c is for the clinic -# m is the metric -compare_periods <-function(c,m){ - - # b1 <- new_occ_VR %>% filter(Clinic==c & as.Date(tStamp) > '2016-01-01' & as.Date(tStamp) < '2016-06-06') - # b2 <- new_occ_VR %>% filter(Clinic==c & as.Date(tStamp) > '2016-06-8' & as.Date(tStamp) < '2016-09-01') - # b3 <- new_occ_VR %>% filter(Clinic==c & as.Date(tStamp) > '2016-09-02' & as.Date(tStamp) < '2017-04-15') - # b4 <- new_occ_VR %>% filter(Clinic==c & as.Date(tStamp) > '2017-04-15' & as.Date(tStamp) < '2017-09-01') - # b5 <- new_occ_VR %>% filter(Clinic==c & as.Date(tStamp) > '2017-09-02' & as.Date(tStamp) < '2017-12-31') - - - a1=routine_metric(b1,'Action',m ) - a2=routine_metric(b2,'Action',m ) - a3=routine_metric(b3,'Action',m ) - a4=routine_metric(b4,'Action',m ) - a5=routine_metric(b5,'Action',m ) - - r1=routine_metric(b1,'Role',m ) - r2=routine_metric(b2,'Role',m ) - r3=routine_metric(b3,'Role',m ) - r4=routine_metric(b4,'Role',m ) - r5=routine_metric(b5,'Role',m ) - - ws1=routine_metric(b1,'Workstation',m ) - ws2=routine_metric(b2,'Workstation',m ) - ws3=routine_metric(b3,'Workstation',m ) - ws4=routine_metric(b4,'Workstation',m ) - ws5=routine_metric(b5,'Workstation',m ) - - rc_a1=routine_metric(rc1,'Action',m ) - rc_a2=routine_metric(rc2,'Action',m ) - rc_a3=routine_metric(rc3,'Action',m ) - rc_a4=routine_metric(rc4,'Action',m ) - rc_a5=routine_metric(rc5,'Action',m ) - - smh_a1=routine_metric(smh1,'Action',m ) - smh_a2=routine_metric(smh2,'Action',m ) - smh_a3=routine_metric(smh3,'Action',m ) - smh_a4=routine_metric(smh4,'Action',m ) - smh_a5=routine_metric(smh5,'Action',m ) - - hh_a1=routine_metric(hh1,'Action',m ) - hh_a2=routine_metric(hh2,'Action',m ) - hh_a3=routine_metric(hh3,'Action',m ) - hh_a4=routine_metric(hh4,'Action',m ) - hh_a5=routine_metric(hh5,'Action',m ) - - lex=data.frame(c(a1,a2,a3,a4,a5), - c(hh_a1,hh_a2,hh_a3,hh_a4,hh_a5), - c(rc_a1,rc_a2,rc_a3,rc_a4,rc_a5), - c(smh_a1,smh_a2,smh_a3,smh_a4,smh_a5) - ) - - ws1=ws1[ws1>0]/nrow(b1) - ws2=ws2[ws2>0]/nrow(b2) - ws3=ws3[ws3>0]/nrow(b3) - -} # o is the list of occurrences diff --git a/Stats/precedence_functions.R b/Stats/precedence_functions.R new file mode 100644 index 0000000..888851e --- /dev/null +++ b/Stats/precedence_functions.R @@ -0,0 +1,38 @@ + +# need to test if this is working correctly 
+# It should be counting the number of times that each element in the lexicon precedes each other element.
+
+# make some data
+S=lapply(1:10,function(x) {c('a','b','a','c',sample(letters[5:6],1+round(runif(1)*10),replace=TRUE)) })
+
+# take list of sequences, S
+precedes <- function(S){
+
+ # Get lexicon
+ lex = sort(unique(unlist(S)))
+
+ # for each element in the lexicon...
+ result = matrix(data = sapply(lex, function(a){
+
+ # Find its position(s) in each sequence. pos is a list of vectors
+ pos=sapply(S, function(s) {grep(a,s)})
+
+ # now for the last position in each string, find the lexical elements that precede it
+ # get the substrings
+ subS = mapply(function(p,s){if (max(p)>1) s[1:max(p)]},pos, S)
+
+ # Now find all of the other lexical elements that precede a in each subS
+ colSums(sapply(lex,function(x){str_detect(subS,x)})) }
+ ) ,
+
+ ncol=length(lex))
+
+ rownames(result) = lex
+ colnames(result) = lex
+
+ # normalize by the number of sequences, so "always" = 1
+ result = result/length(S)
+
+ return(result)
+
+}
\ No newline at end of file

From fd42ea4ac6690c296961d8900f9f197e40931f2c Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Sat, 1 Dec 2018 08:02:45 -0500
Subject: [PATCH 17/31] Added comments on next version...

---
 Data Prep/ACHR_Batch_Aggregate_Data.R | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R
index cfc7ea5..464b2b1 100644
--- a/Data Prep/ACHR_Batch_Aggregate_Data.R
+++ b/Data Prep/ACHR_Batch_Aggregate_Data.R
@@ -14,6 +14,11 @@
 # 1) By thread (usually visit, but could be visit_role)
 # 2) By Clinic_day

+# Rewrite for speed:
+# a) go back to for loop, but just for the stuff that is computed based on the df
+# b) grab first row of each thread seqNum ==1 from the table
+# c) merge them back together
+# d) Probably need to do different versions for visit vs. visit_role because a lot of the parameters make no sense...

From 8f256b05b60d2d27ac664b9427b4addf68a1a4a0 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Sat, 8 Dec 2018 14:37:26 -0500
Subject: [PATCH 18/31] Finished updating code for aggregating. Added code for Team Documenter analysis...

---
 Data Prep/ACHR_Batch_Aggregate_Data.R   | 808 +++++++++++++++++-------
 Data Prep/ACHR_Clinic_Trajectories_v3.R | 154 ++++-
 Tables and Figures/Clinic_day_plots.R   | 110 ++++
 3 files changed, 845 insertions(+), 227 deletions(-)

diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R
index 464b2b1..4bbd456 100644
--- a/Data Prep/ACHR_Batch_Aggregate_Data.R
+++ b/Data Prep/ACHR_Batch_Aggregate_Data.R
@@ -30,9 +30,8 @@
 # ALL_CFs are used to compute the CF_alignment

-# need to bring Role_ID forward when looking at Visit_Role, so we can track residents over time.
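
# A hand-checkable test sketch for the 'need to test' note above. precedes() relies on
# stringr::str_detect, so stringr must be loaded first. The expected values below come
# from the stated definition, not from running the code.
library(stringr)
S_test <- list(c('a','b'), c('b','a'), c('a','b'))
P <- precedes(S_test)
# By the definition: 'a' precedes the last 'b' in 2 of the 3 sequences, and 'b'
# precedes the last 'a' in 1 of 3, so P['a','b'] should be 2/3 and P['b','a'] 1/3;
# other values would confirm the suspicion that the function still needs work.
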
-ACHR_batch_threads <- function(occ,THREAD_CFs, EVENT_CFs, ALL_CFs) { +ACHR_batch_threads <- function(occ, EVENT_CFs, ALL_CFs) { library(tidyr) library(dplyr) @@ -42,121 +41,325 @@ ACHR_batch_threads <- function(occ,THREAD_CFs, EVENT_CFs, ALL_CFs) { library(stringr) library(data.table) + + # get the first occurrence from each thread -- this will be used for the stuff that never changes, like clinic + firstOcc=occ[occ$seqNum==1,] + + # always used threadNum for the TN + TN = 'threadNum' + # Add columns for combinations of CFs if needed - thread_col = newColName(THREAD_CFs) - TN = thread_col - if (!(thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CFs,thread_col) } - + # don't need to add new column for the thread_CFs. That one has to be here. + # thread_col = newColName(THREAD_CFs) + # TN = thread_col + # if (!(thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CFs,thread_col) } new_event_col = newColName(EVENT_CFs) if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } - + all_cf_col = newColName(ALL_CFs) if (!(all_cf_col %in% colnames(occ))) { occ = combineContextFactors(occ,ALL_CFs,all_cf_col) } - # set key on the data.table + # set key on the data.table for the threadNum + occ=as.data.table(occ) setkeyv(occ, TN) # get the list of buckets - bucket_list <- unique(occ[[TN]]) + bucket_list <- sort( unique(occ[[TN]]) ) - # print the number of buckets - print(paste0('Number of buckets=', length(bucket_list) )) + # get the number of buckets + N = length(bucket_list) + print(paste0('Number of buckets=', N )) + # allocate the data table with N rows, then for speed, we use := to update it. + ACHR = data.table(threadNum = integer(N), + Phase = character(N), + NEvents = integer(N), + ThreadDuration =double(N), + VisitDuration= double(N), + wait_time1 = double(N), + wait_time2 = double(N), + NetComplexity=double(N), + Nodes=double(N), + Edges=double(N), + CompressRatio = double(N), + Entropy = double(N), + NumProcedures = double(N), + NumDiagnoses = double(N), + ALL_CF_count = integer(N), + ALL_CF_entropy = double(N), + CF_Alignment = double(N) ) + + # Now add columns for the CF counts. + for (cf in ALL_CFs){ + ACHR[, paste0(cf,"_count"):= double(N)] + # ACHR[, paste0(cf,"_compression"):= double(N)] + # ACHR[, paste0(cf,"_entropy"):= double(N)] + } - # make data frame with results - Thrds = data.table( t(sapply( bucket_list, - function(b){ - - # select a subset - df= occ[get(TN)==b] - - # make sure it is sorted by timestamp - df=df[order(df$tStamp),] - - # get the network -- only if there are enough rows... 
- - if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) - else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) - - # compute each parameter and put them in a vector - c( - bucket=b, - Clinic = as.character(df$Clinic[1]), - Physician = as.character(df$Physician[1]), - ymd = as.character(df$ymd[1]), - Clinic_ymd = as.character(df$Clinic_ymd[1]), - Weekday = as.character(df$Weekday[1]), - Month = as.character(df$Month[1]), - Phase = compute_phase(df$tStamp[1]), - threadNum = as.numeric(df$threadNum[1]), - Visit_ID = as.character(df$Visit_ID[1]), - Subject_ID = as.character( df$Subject_ID[1] ), - Role_ID = as.character( df$Role_ID[1] ), - NEvents = nrow(df), - ThreadStart= as.character(df$tStamp[1] ), - ThreadDuration= compute_thread_duration(df), - VisitDuration= compute_visit_duration(df), - wait_time1 = compute_wait_time1(df), - wait_time2 = compute_wait_time2(df), - Visit_number = as.numeric(df[1,'Visitnum_Total']), - LOC_CPT = as.character(df[1,'LOS_CPT']), - NetComplexity=estimate_network_complexity( n ), - Nodes=nrow(n$nodeDF), - Edges=nrow(n$edgeDF), - CompressRatio = compression_index(df,new_event_col), - Entropy = compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]), - NumProcedures = count_procedures(df$Proc[1]), - NumDiagnoses = count_diagnoses(df$Diag[1]), - Proc = as.character(df[1,'Proc']), - Diagnosis = as.character(df[1,'Diag']), - Diagnosis_group = as.character(df[1,'Diagnosis_Group']), - CF_Alignment = 1, # make placeholder, but compute below - ALL_CF_count = length(unique(df[[all_cf_col]])), - ALL_CF_entropy = compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]), - - # name these columns afterwards - sapply(ALL_CFs, function(cf){ - c( length(unique(df[[cf]])) - # , - # compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) - ) }) - - ) - } ))) - - - # name the last columns -- code has to match above - cn = as.vector(sapply(ALL_CFs, function(cf){ - c( paste0(cf,"_count") - # , - # paste0(cf,"_entropy") - ) })) - - # now assign them to the last columns - last_col = ncol(Thrds) - first_col= last_col-length(cn)+1 - setnames(Thrds, c(first_col:last_col), cn) - - # Compute the alignment of the context factors - Thrds$CF_Alignment = as.numeric( as.character(Thrds$Action_count)) / as.numeric( as.character(Thrds$ALL_CF_count )) - - save_file_name = paste0(paste('Thrds',TN,new_event_col,sep='+'), '.Rdata') - save(Thrds, file=save_file_name) - - print(paste('Saved ', nrow(Thrds), " records in ",save_file_name)) - - return(Thrds) + # update data table with results - only compute the ones that require looking at the whole thread + for (b in 1:N){ + + if (b %% 1000 ==0) print(paste0('Thread count = ',b)) + + # select a subset + df= occ[get(TN)==bucket_list[b]] + + # make sure it is sorted by timestamp + df=df[order(df$tStamp),] + + # get the network -- only if there are enough rows... 
+ if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) + else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) + + # compute each parameter and update the table + ACHR[b,threadNum := bucket_list[b] ] + ACHR[b,Phase := compute_phase(df$tStamp[1])] + ACHR[b,NEvents := nrow(df) ] + ACHR[b,ThreadDuration:= compute_thread_duration(df) ] + ACHR[b,VisitDuration:= compute_visit_duration(df) ] + ACHR[b, wait_time1 := compute_wait_time1(df) ] + ACHR[b, wait_time2 := compute_wait_time2(df) ] + ACHR[b, NetComplexity:=estimate_network_complexity( n ) ] + ACHR[b, Nodes:=nrow(n$nodeDF) ] + ACHR[b,Edges:=nrow(n$edgeDF) ] + ACHR[b,CompressRatio := compression_index(df,new_event_col) ] + ACHR[b,Entropy := compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]) ] + ACHR[b,NumProcedures := count_procedures(df$Proc[1]) ] + ACHR[b,NumDiagnoses := count_diagnoses(df$Diag[1]) ] + ACHR[b,ALL_CF_count := length(unique(df[[all_cf_col]])) ] + ACHR[b,ALL_CF_entropy := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ] + + # Count the unique elements in each cf + for (cf in ALL_CFs){ ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] } + } + + # Compute the alignment of the context factors + ACHR$CF_Alignment = as.numeric( as.character(ACHR$Action_count)) / as.numeric( as.character(ACHR$ALL_CF_count )) + + # Merge the results with the first row from each thread + print('Merging results...') + Thrds=merge(x=ACHR, y=firstOcc, by.x='threadNum', by.y ='threadNum' ,all=TRUE) + save_file_name = paste0(paste('Thrds',TN,new_event_col,sep='+'), '.Rdata') + save(Thrds, file=save_file_name) + + print(paste('Saved ', nrow(Thrds), " records in ",save_file_name)) + + return(Thrds) } +################################################################################# +# streamlined version for threads that are PART OF A VISIT. So you can merge with visits +# to get the visit context. Just compute the minimum for speed. +ACHR_batch_visit_role_threads <- function(occ, EVENT_CFs, ALL_CFs, visits) { + + library(tidyr) + library(dplyr) + library(ThreadNet) + library(ngram) + library(lubridate) + library(stringr) + library(data.table) + + + # get the first occurrence from each thread -- this will be used for the stuff that never changes, like clinic + # firstOcc=occ[occ$seqNum==1,] + + # always used threadNum for the TN + TN = 'threadNum' + + # Add columns for combinations of CFs if needed + # don't need to add new column for the thread_CFs. That one has to be here. + # thread_col = newColName(THREAD_CFs) + # TN = thread_col + # if (!(thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CFs,thread_col) } + + new_event_col = newColName(EVENT_CFs) + if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } + + all_cf_col = newColName(ALL_CFs) + if (!(all_cf_col %in% colnames(occ))) { occ = combineContextFactors(occ,ALL_CFs,all_cf_col) } + + # set key on the data.table for the threadNum + occ=as.data.table(occ) + setkeyv(occ, TN) + + # get the list of buckets + bucket_list <- sort( unique(occ[[TN]]) ) + + # get the number of buckets + N = length(bucket_list) + print(paste0('Number of buckets=', N )) + + # allocate the data table with N rows, then for speed, we use := to update it. 
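
# The allocate-then-assign-by-reference pattern used in these batch functions, in
# miniature. This matches the 'Rewrite for speed' plan above: pre-size the table once,
# then fill rows with :=, instead of building the whole result inside t(sapply(...)).
library(data.table)
N_toy <- 3
res <- data.table(id = integer(N_toy), val = double(N_toy))
for (b in 1:N_toy) {
  res[b, id := b]          # := writes the row in place; no copy of res is made
  res[b, val := b * 1.5]
}
res   # id 1:3, val 1.5 3.0 4.5
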
+ ACHR = data.table(Visit_ID = character(N), + Role_VR = character(N), + Role_ID_VR = character(N), + threadNumVR = integer(N), + NEventsVR = integer(N), + threadStartVR = character(N), + ThreadDurationVR =double(N), + NetComplexityVR=double(N), + NodesVR=double(N), + EdgesVR=double(N), + CompressRatioVR = double(N), + EntropyVR = double(N), + ALL_CF_countVR = integer(N), + ALL_CF_entropyVR = double(N) ) + + # Now add columns for the CF counts. + for (cf in ALL_CFs){ + ACHR[, paste0(cf,"_countVR"):= double(N)] + # ACHR[, paste0(cf,"_compression"):= double(N)] + # ACHR[, paste0(cf,"_entropy"):= double(N)] + } + + # update data table with results - only compute the ones that require looking at the whole thread + for (b in 1:N){ + + if (b %% 1000 ==0) print(paste0('Thread count = ',b)) + + # select a subset + df= occ[get(TN)==bucket_list[b]] + + # get the network -- only if there are enough rows... + if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) + else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) + + # compute each parameter and update the table + ACHR[b,Visit_ID := as.character(df$Visit_ID[1]) ] + ACHR[b,Role_VR:= as.character(df$Role[1]) ] + ACHR[b,Role_ID_VR:= as.character(df$Role_ID[1]) ] + ACHR[b,threadNumVR := bucket_list[b] ] + ACHR[b,NEventsVR := nrow(df) ] + ACHR[b,threadStartVR := as.character(min(lubridate::ymd_hms(df$tStamp))) ] + ACHR[b,ThreadDurationVR:= difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours' ) ] + ACHR[b, NetComplexityVR:=estimate_network_complexity( n ) ] + ACHR[b, NodesVR:=nrow(n$nodeDF) ] + ACHR[b,EdgesVR:=nrow(n$edgeDF) ] + ACHR[b,CompressRatioVR := compression_index(df,new_event_col) ] + ACHR[b,EntropyVR := compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]) ] + ACHR[b,ALL_CF_countVR := length(unique(df[[all_cf_col]])) ] + ACHR[b,ALL_CF_entropyVR := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ] + + # Count the unique elements in each cf + for (cf in ALL_CFs){ ACHR[b, paste0(cf,"_countVR") := length(unique(df[[cf]])) ] } + } + + # Compute the alignment of the context factors + # ACHR$CF_AlignmentVR = as.numeric( as.character(ACHR$Action_countVR)) / as.numeric( as.character(ACHR$ALL_CF_countVR )) + + # Merge the results with the first row from each thread + print('Merging results...') + VRThrds=merge(x=ACHR, y=visits, by.x='Visit_ID', by.y ='Visit_ID' ,all=TRUE) + + save_file_name = paste0(paste('VRThrds',TN,new_event_col,sep='+'), '.Rdata') + save(VRThrds, file=save_file_name) + + print(paste('Saved ', nrow(VRThrds), " records in ",save_file_name)) + + return(VRThrds) +} - -get_timeScale <- function(){'hr'} - - - -# Need to write all these functions +################################################################################# +# streamlined version for adding or recomputing columns that describe a visit. +# So you can merge with visits to get rest of the columns. Just compute the minimum for speed. +# Can re-write this as needed... +ACHR_batch_visit_add_columns <- function(occ, EVENT_CFs, visits) { + + library(tidyr) + library(dplyr) + library(ThreadNet) + library(ngram) + library(lubridate) + library(stringr) + library(data.table) + + + # get the first occurrence from each thread -- this will be used for the stuff that never changes, like clinic + # firstOcc=occ[occ$seqNum==1,] + + # always used threadNum for the TN + TN = 'threadNum' + + # Add columns for combinations of CFs if needed + # don't need to add new column for the thread_CFs. 
That one has to be here. + # thread_col = newColName(THREAD_CFs) + # TN = thread_col + # if (!(thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CFs,thread_col) } + + new_event_col = newColName(EVENT_CFs) + if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } + + + # set key on the data.table for the threadNum + occ=as.data.table(occ) + setkeyv(occ, TN) + + # get the list of buckets + bucket_list <- sort( unique(occ[[TN]]) ) + + # get the number of buckets + N = length(bucket_list) + print(paste0('Number of buckets=', N )) + + # allocate the data table with N rows, then for speed, we use := to update it. + ACHR = data.table(Visit_ID = character(N), + new_wait_time1 = double(N), + new_wait_time2 = double(N) ) + + # Role_VR = character(N), + # Role_ID_VR = character(N), + # threadNumVR = integer(N), + # NEventsVR = integer(N), + # threadStartVR = character(N), + # ThreadDurationVR =double(N), + # NetComplexityVR=double(N), + # NodesVR=double(N), + # EdgesVR=double(N), + # CompressRatioVR = double(N), + # EntropyVR = double(N), + # ALL_CF_countVR = integer(N), + # ALL_CF_entropyVR = double(N) ) + + + # update data table with results - only compute the ones that require looking at the whole thread + for (b in 1:N){ + + if (b %% 1000 ==0) print(paste0('Thread count = ',b)) + + # select a subset + df= occ[get(TN)==bucket_list[b]] + + # get the network -- only if there are enough rows... + # if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) + # else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) + # + # compute each parameter and update the table + ACHR[b,Visit_ID := as.character(df$Visit_ID[1]) ] + ACHR[b, new_wait_time1 := compute_wait_time1(df) ] + ACHR[b, new_wait_time2 := compute_wait_time2(df) ] + } + + + # Compute the alignment of the context factors + # ACHR$CF_AlignmentVR = as.numeric( as.character(ACHR$Action_countVR)) / as.numeric( as.character(ACHR$ALL_CF_countVR )) + + # Merge the results with the first row from each thread + print('Merging results...') + # VRThrds=merge(x=ACHR, y=visits, by.x='Visit_ID', by.y ='Visit_ID' ,all=TRUE) + VRThrds=merge(x=ACHR, y=visits, by.x='Visit_ID', by.y ='Visit_ID') + + save_file_name = paste0(paste('VRThrds',TN,new_event_col,sep='+'), '.Rdata') + save(VRThrds, file=save_file_name) + + print(paste('Saved ', nrow(VRThrds), " records in ",save_file_name)) + + return(VRThrds) +} +################################################################################## +# Helper functions compute_phase <- function(t){ t=as.Date(t) @@ -193,64 +396,105 @@ compute_wait_time1 <- function(df){ # find the first occurrence of MR_VN_VITALS. Corresponds closely to wait time # w= grep('MR_VN_VITALS',df$Action)[1] - w= grep('VITALS',df$Action)[1] - return( difftime( lubridate::ymd_hms(df$tStamp[w]),lubridate::ymd_hms(df$tStamp[1]), units='hours' ) ) + w1= grep('CHECKIN_TIME',df$Action)[1] + w2= grep('VITALS',df$Action)[1] + + return( max(0, as.numeric(difftime( lubridate::ymd_hms(df$tStamp[w2]),lubridate::ymd_hms(df$tStamp[w1]), units='hours' ) ))) } # Second wait time... from vitals until they see a doctor or other medical person +# NEW VERSION -- look for "AC_VISIT_NAVIGATOR" by a different role than "VITALS" compute_wait_time2 <- function(df){ # find the first occurrence of MR_VN_VITALS. 
Get the time t1 - # w1= grep('MR_VN_VITALS',df$Action)[1] + # w1= grep('MR_VN_VITALS',df$Action)[1] w1= grep('VITALS',df$Action)[1] if (is.na(w1)) return(NA) else { - t1 = lubridate::ymd_hms(df$tStamp[w1]) - - - # get the workstation and role used for the vitals at t1 - work1 = df$Workstation[w1] - role1 = df$Role[w1] - - # get set of actions at the same workstation or different workstations that happen later in the visit - # Truncate the visit up to w1 - restdf = df[w1:nrow(df),] - - # then get the set of action at the same workstation or different workstations (in that visit) - sameWS = restdf[restdf$Workstation == work1,] - differentWS = restdf[restdf$Workstation != work1,] - - # get list of other roles - otherRoles = paste(setdiff(unique(restdf$Role),c('Technician','Technologist','Staff','Unknown')), - collapse = '|') - - # Now find the marker for when they see a physician - # procedure is differenrt in BRKPT and HHPOB clinics - if (df$Clinic[1] %in% c('BRKPT','HHPOB') ) - { # LPN brings a different workstation on wheels at these clinics - firstLPN = grep('LPN',differentWS$Role)[1] - t2 = lubridate::ymd_hms(differentWS$tStamp[firstLPN]) + t1 = lubridate::ymd_hms(df$tStamp[w1]) - # Other roles use the same workstation - firstDoc = grep('Physician|Resident|Registered', sameWS$Role)[1] - t2 = lubridate::ymd_hms(sameWS$tStamp[firstDoc]) - } - else - { # a different role, using the same workstation - w2 = max(w1, grep(otherRoles, sameWS$Role) )[1] - t2 = lubridate::ymd_hms(sameWS$tStamp[w2]) - } - # return the difference in time from t1 to t2 - return( difftime( t2, t1, units='hours' ) ) + # get the workstation and role used for the vitals at t1 + # work1 = df$Workstation[w1] + role1 = df$Role[w1] + + # get set of actions at the same workstation or different workstations that happen later in the visit + # Truncate the visit up to w1 + restdf = df[(w1+1):nrow(df),] + + # get the actions performed by other roles in the rest of the visit + differentRole = restdf[restdf$Role != role1,] + + # Now look for first instance of AC_VISIT_NAVIGATOR that is by one of the other roles + # first_AC_NAV = grep('AC_VISIT_NAVIGATOR',differentRole$Action)[1] + # t2 = lubridate::ymd_hms(differentRole$tStamp[first_AC_NAV]) + + # Try first instance of different role. period. + t2 = lubridate::ymd_hms(differentRole$tStamp[1]) + + + # return the difference in time from t1 to t2 + return( max(0, as.numeric(difftime( t2, t1, units='hours' ) ))) } } +# Second wait time... from vitals until they see a doctor or other medical person +# compute_wait_time2 <- function(df){ +# +# # find the first occurrence of MR_VN_VITALS. 
Get the time t1 +# # w1= grep('MR_VN_VITALS',df$Action)[1] +# w1= grep('VITALS',df$Action)[1] +# +# if (is.na(w1)) +# return(NA) +# else +# { +# t1 = lubridate::ymd_hms(df$tStamp[w1]) +# +# +# # get the workstation and role used for the vitals at t1 +# work1 = df$Workstation[w1] +# role1 = df$Role[w1] +# +# # get set of actions at the same workstation or different workstations that happen later in the visit +# # Truncate the visit up to w1 +# restdf = df[w1:nrow(df),] +# +# # then get the set of action at the same workstation or different workstations (in that visit) +# sameWS = restdf[restdf$Workstation == work1,] +# differentWS = restdf[restdf$Workstation != work1,] +# +# # get list of other roles +# otherRoles = paste(setdiff(unique(restdf$Role),c('Technician','Technologist','Staff','Unknown')), +# collapse = '|') +# +# # Now find the marker for when they see a physician +# # procedure is differenrt in BRKPT and HHPOB clinics +# if (df$Clinic[1] %in% c('BRKPT','HHPOB') ) +# { # LPN brings a different workstation on wheels at these clinics +# firstLPN = grep('LPN',differentWS$Role)[1] +# t2 = lubridate::ymd_hms(differentWS$tStamp[firstLPN]) +# +# # Other roles use the same workstation +# firstDoc = grep('Physician|Resident|Registered', sameWS$Role)[1] +# t2 = lubridate::ymd_hms(sameWS$tStamp[firstDoc]) +# } +# else +# { # a different role, using the same workstation +# w2 = max(w1, grep(otherRoles, sameWS$Role) )[1] +# t2 = lubridate::ymd_hms(sameWS$tStamp[w2]) +# } +# +# # return the difference in time from t1 to t2 +# return( difftime( t2, t1, units='hours' ) ) +# } +# } + # THIS ONE IS GOOD. count_procedures <- function( p ){ @@ -280,7 +524,19 @@ count_diagnoses <- function(d){ return( max(1, str_count( d, '#@#' ))) } +count_daily_procedures <- function( df ) { + + # get the string of procedures for that day + allproc = unlist(df[df$seqNum==1, 'Proc']) + + if (length(allproc)>0) + return( sum(sapply(allproc, function(p) {count_procedures(p)} )) ) + else return(0) + + +} +# get_timeScale <- function(){'hr'} @@ -289,7 +545,6 @@ count_diagnoses <- function(d){ ################################################################### # This is function is set up to aggregate the occurrences among collections of visit -- typically clinic_days # occ = pre-processed threaded occurrences -# TN = threadNum in most cases # CFs can be chosen ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) { @@ -303,9 +558,10 @@ ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) { # make a list of unique buckets. bucket_list <- unique(occ[[TN]]) + N = length(bucket_list) # print the number of buckets - print(paste0('Number of buckets=', length(bucket_list) )) + print(paste0('Number of buckets=', N )) # Add columns for combinations of CFs if needed new_event_col = newColName(EVENT_CFs) @@ -314,78 +570,82 @@ ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) { all_cf_col = newColName(ALL_CFs) if (!(all_cf_col %in% colnames(occ))) { occ = combineContextFactors(occ,ALL_CFs,all_cf_col) } + # this will speed up retrieving the subsets + setkeyv(occ, TN) + + # allocate the data table with N rows, then for speed, we use := to update it. 
+ cds = data.table(Clinic_ymd = character(N), + Clinic = character(N), + ymd = character(N), + Phase = character(N), + NEvents = integer(N), + ClinicDayStart = character(N), + ClinicDayDuration =double(N), + NumVisits = integer(N), + NumUniqueDiagnosisGroups = integer(N), + NumPhysicians = integer(N), + TotalStaff = integer(N), + NetComplexity=double(N), + Nodes=double(N), + Edges=double(N), + CompressRatio = double(N), + Entropy = double(N), + ALL_CF_count = integer(N), + ALL_CF_entropy = double(N), + CF_Alignment = double(N)) + + # Now add columns for the CF counts. + for (cf in ALL_CFs){ + cds[, paste0(cf,"_countVR"):= double(N)] + # cds[, paste0(cf,"_compression"):= double(N)] + # cds[, paste0(cf,"_entropy"):= double(N)] + } + # make data frame with results - cds = data.table( t(sapply( bucket_list, - function(b){ - - # select a subset of occurrences for the bucket - df = occ[ occ[[TN]] ==b , ] - - - # get the network -- only if there are enough rows... - if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) - else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) - - # compute each parameter and put them in a vector - c( - bucket=b, - Clinic = df[1,'Clinic'], - Physician = as.character(df[1,'Physician']), - ymd = df[1,'ymd'], - Clinic_ymd = df[1,'Clinic_ymd'], - Weekday = df[1,'Weekday'], - Month = df[1,'Month'], - Phase = compute_phase(df$tStamp[1]), - threadNum = df[1,'threadNum'], - Visit_ID = df[1,'Visit_ID'], - Subject_ID = df[1,'Subject_ID'], - NEvents = nrow(df), - bucketStart= as.character( df[1,'tStamp'] ), - bucketStartInt = df[1,'tStamp'], - bucketDuration= difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours' ), - # wait_time = compute_wait_time(df), - # Visit_number = integer(N), - # LOC_CPT = character(N), - # NumUniqueProcedures = length(unique(df[['Proc']]) - NumVisits = length(unique(df[['Visit_ID']])), - NumUniqueDiagnosisGroups = length(unique(df[['Diagnosis_Group']])), - NumPhysicians = length(unique(df[['Physician']])), - TotalStaff = length(unique(df[['Role_ID']])), - NetComplexity=estimate_network_complexity( n ), - Nodes=nrow(n$nodeDF), - Edges=nrow(n$edgeDF), - CompressRatio = compression_index(df,new_event_col), - Entropy = compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]), - # NumProcedures = count_procedures(df$Proc[1]), - # NumDiagnoses = count_diagnoses(df$Diag[1]), - # Diagnosis_group = df[1,'Diagnosis_Group'], - CF_Alignment = 1, # make placeholder, but compute below - ALL_CF_count = length(unique(df[[all_cf_col]])), - ALL_CF_entropy = compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]), - - # name these columns afterwards - sapply(ALL_CFs, function(cf){ - c( length(unique(df[[cf]])) - # , - # compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) - ) }) - - ) - } ))) - - - # name the last columns -- code has to match above - cn = as.vector(sapply(ALL_CFs, function(cf){ - c( paste0(cf,"_count") - # , - # paste0(cf,"_entropy") - ) })) - - # now assign them to the last columns - last_col = ncol(cds) - first_col= last_col-length(cn)+1 - setnames(cds, c(first_col:last_col), cn) + for (b in 1:N){ + + + if (b %% 100 ==0) print(paste0('Thread count = ',b)) + + + # select a subset of occurrences for the bucket + df = occ[ occ[[TN]] == bucket_list[b] ] + + + # get the network -- only if there are enough rows... 
+ if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col)
+ else n = list(edgeDF=t(c(0)),nodeDF=t(c(0)))
+
+ # compute each parameter and put them in a vector
+
+ cds[b,Clinic_ymd := bucket_list[b] ]
+ cds[b,Clinic := as.character( df$Clinic[1] ) ]
+ cds[b,ymd := as.character( df$ymd[1] ) ]
+ cds[b,Phase := compute_phase(df$tStamp[1]) ]
+ cds[b,threadNum := as.numeric(df[1,'threadNum']) ]
+ cds[b,NEvents := nrow(df) ]
+ cds[b,ClinicDayStart := as.character(lubridate::ymd_hms(df$tStamp[1])) ]
+ cds[b,ClinicDayDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours' ) ]
+ cds[b,NumVisits := length(unique( df$Visit_ID )) ]
+ cds[b,NumUniqueDiagnosisGroups := length(unique( df$Diagnosis_Group )) ]
+ cds[b,NumPhysicians := length(unique( df$Physician )) ]
+ cds[b,TotalStaff := length(unique( df$Role_ID )) ]
+ cds[b, NetComplexity := estimate_network_complexity( n ) ]
+ cds[b,Nodes := nrow(n$nodeDF) ]
+ cds[b,Edges := nrow(n$edgeDF) ]
+ cds[b,CompressRatio := compression_index(df,new_event_col) ]
+ cds[b,Entropy := compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]) ]
+ cds[b,NumProceduresPerDay := count_daily_procedures(df) ]
+ cds[b,CF_Alignment := 1 ] # make placeholder, but compute below
+ cds[b,ALL_CF_count := length(unique(df[[all_cf_col]])) ]
+ cds[b,ALL_CF_entropy := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ]
+
+ # Count the unique elements in each cf
+ for (cf in ALL_CFs){ cds[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] }
+
+
+}

 # Compute the alignment of the context factors
 cds$CF_Alignment = as.numeric( as.character(cds$Action_count)) / as.numeric( as.character(cds$ALL_CF_count ))
@@ -466,7 +726,7 @@ ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) {
 # if (b%%100==0) {print(b)}
 #
 # # select the threads that go in this bucket
-# df = occ[occ[[TN]] ==bucket_list[i],]
+# df = occ[occ[[TN]] ==bucket_list[i] ]
 #
 # # bucket number
 # ACHR[b,bucket := b]
@@ -618,7 +878,7 @@ ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) {
 # if (b%%10==0) {print(b)}
 #
 # # select the threads that go in this bucket
-# df = occ[occ[[BU]] ==bucket_list[i],]
+# df = occ[occ[[BU]] ==bucket_list[i] ]
 #
 # # bucket number
 # ACHR[b,bucket := b]
@@ -700,3 +960,125 @@ ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) {
 # return(ACHR)
 # }
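
# Toy reading of the CF_Alignment ratio computed above: with 40 distinct Actions but
# 100 distinct Action_Role_Workstation combinations in a clinic-day, the alignment is
# 40/100 = 0.4; a value of 1.0 would mean Role and Workstation add no distinctions
# beyond the Actions themselves. (Counts here are made up for illustration.)
Action_count <- 40
ALL_CF_count <- 100
Action_count / ALL_CF_count   # 0.4
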
+ +# ACHR_batch_threads_no_merge_old <- function(occ,THREAD_CFs, EVENT_CFs, ALL_CFs) { +# +# library(tidyr) +# library(dplyr) +# library(ThreadNet) +# library(ngram) +# library(lubridate) +# library(stringr) +# library(data.table) +# +# # Add columns for combinations of CFs if needed +# thread_col = newColName(THREAD_CFs) +# TN = thread_col +# if (!(thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CFs,thread_col) } +# +# +# new_event_col = newColName(EVENT_CFs) +# if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } +# +# all_cf_col = newColName(ALL_CFs) +# if (!(all_cf_col %in% colnames(occ))) { occ = combineContextFactors(occ,ALL_CFs,all_cf_col) } +# +# # set key on the data.table +# setkeyv(occ, TN) +# +# # get the list of buckets +# bucket_list <- unique(occ[[TN]]) +# +# # print the number of buckets +# print(paste0('Number of buckets=', length(bucket_list) )) +# +# +# # make data frame with results +# Thrds = data.table( t(sapply( bucket_list, +# function(b){ +# +# # select a subset +# df= occ[get(TN)==b] +# +# # make sure it is sorted by timestamp +# df=df[order(df$tStamp),] +# +# # get the network -- only if there are enough rows... +# +# if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) +# else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) +# +# # compute each parameter and put them in a vector +# c( +# bucket=b, +# Clinic = as.character(df$Clinic[1]), +# Physician = as.character(df$Physician[1]), +# ymd = as.character(df$ymd[1]), +# Clinic_ymd = as.character(df$Clinic_ymd[1]), +# Weekday = as.character(df$Weekday[1]), +# Month = as.character(df$Month[1]), +# Phase = compute_phase(df$tStamp[1]), +# threadNum = as.numeric(df$threadNum[1]), +# Visit_ID = as.character(df$Visit_ID[1]), +# Subject_ID = as.character( df$Subject_ID[1] ), +# Role_ID = as.character( df$Role_ID[1] ), +# NEvents = nrow(df), +# ThreadStart= as.character(df$tStamp[1] ), +# ThreadDuration= compute_thread_duration(df), +# VisitDuration= compute_visit_duration(df), +# wait_time1 = compute_wait_time1(df), +# wait_time2 = compute_wait_time2(df), +# Visit_number = as.numeric(df[1,'Visitnum_Total']), +# LOC_CPT = as.character(df[1,'LOS_CPT']), +# NetComplexity=estimate_network_complexity( n ), +# Nodes=nrow(n$nodeDF), +# Edges=nrow(n$edgeDF), +# CompressRatio = compression_index(df,new_event_col), +# Entropy = compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]), +# NumProcedures = count_procedures(df$Proc[1]), +# NumDiagnoses = count_diagnoses(df$Diag[1]), +# Proc = as.character(df[1,'Proc']), +# Diagnosis = as.character(df[1,'Diag']), +# Diagnosis_group = as.character(df[1,'Diagnosis_Group']), +# CF_Alignment = 1, # make placeholder, but compute below +# ALL_CF_count = length(unique(df[[all_cf_col]])), +# ALL_CF_entropy = compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]), +# +# # name these columns afterwards +# sapply(ALL_CFs, function(cf){ +# c( length(unique(df[[cf]])) +# # , +# # compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) +# ) }) +# +# ) +# } ))) +# +# +# # name the last columns -- code has to match above +# cn = as.vector(sapply(ALL_CFs, function(cf){ +# c( paste0(cf,"_count") +# # , +# # paste0(cf,"_entropy") +# ) })) +# +# # now assign them to the last columns +# last_col = ncol(Thrds) +# first_col= last_col-length(cn)+1 +# setnames(Thrds, c(first_col:last_col), cn) +# +# # Compute the alignment of the context factors +# Thrds$CF_Alignment = as.numeric( 
as.character(Thrds$Action_count)) / as.numeric( as.character(Thrds$ALL_CF_count ))
+#
+#   save_file_name = paste0(paste('Thrds',TN,new_event_col,sep='+'), '.Rdata')
+#   save(Thrds, file=save_file_name)
+#
+#   print(paste('Saved ', nrow(Thrds), " records in ",save_file_name))
+#
+#   return(Thrds)
+#
+# }
+#
diff --git a/Data Prep/ACHR_Clinic_Trajectories_v3.R b/Data Prep/ACHR_Clinic_Trajectories_v3.R
index 5093f1e..2328452 100644
--- a/Data Prep/ACHR_Clinic_Trajectories_v3.R
+++ b/Data Prep/ACHR_Clinic_Trajectories_v3.R
@@ -87,6 +87,7 @@ graph_trajectory <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1,
     # get text vector for the whole data set.  Bucket needs at least 2 threads
     # n_gram_size = 1 for 1-grams, n_gram_size = 2 for pairs.
     th = e[ e[[bucket_col]] ==b , ]
+
     if (nrow(th)>3) { ngdf = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2] }
@@ -139,20 +140,28 @@ graph_trajectory <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1,
 #
 #   # if the number if below the threshold, set it to zero
 #   windowFreqMatrix[fm<=filter_threshold] = 0
-
-  df = data.frame( t(sapply( 1:(nWindows-1),
-                             function(i){
-                               c(
-                                 bckt = blist[i],
-                                 Clinic = unlist(strsplit(blist[i],'_'))[1],
-                                 ymd = unlist(strsplit(blist[i],'_'))[2],
-                                 pct_retained = sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/sum(windowFreqMatrix[i,]>0),
-                                 complexity = estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ),
-                                 Dist_from_reference = distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), method='cosine' ),
-                                 Dist_from_next = distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), method='cosine' )
-                               )
-                             })))
+  df = data.table(
+    Clinic_ymd = character(nWindows),
+    Clinic = character(nWindows),
+    ymd = character(nWindows),
+    pct_retained = double(nWindows),
+    complexity = double(nWindows),
+    Dist_from_reference = double(nWindows),
+    Dist_from_next = double(nWindows)
+  )
+
+  for (i in 1:(nWindows-1) ) {
+
+    df[i, Clinic_ymd := blist[i] ]
+    df[i, Clinic := unlist(strsplit(blist[i],'_'))[1] ]
+    df[i, ymd := unlist(strsplit(blist[i],'_'))[2] ]
+    df[i, pct_retained := sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/sum(windowFreqMatrix[i,]>0) ]
+    df[i, complexity := estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ) ]
+    df[i, Dist_from_reference := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), method='cosine' ) ]
+    df[i, Dist_from_next := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), method='cosine' ) ]
+  }
+
   # get the ngram data and labels
   if (keep_ngram_vectors) {
@@ -170,5 +179,122 @@ graph_trajectory <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1,
 }
-
+graph_trajectory_week <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1, filter_threshold=0,keep_ngram_vectors=FALSE, save_file_name='deleteme') {
+
+  # add column for bucket_CF if needed
+  bucket_col = newColName(bucket_CFs)
+  if (!(bucket_col %in% colnames(e))) { e = combineContextFactors(e,bucket_CFs,bucket_col) }
+
+  # here is the sorted list of buckets
+  blist = sort(unique(e[[bucket_col]]))
+
+  # how many buckets?
+  nb = length(blist)
+  # print(blist)
+  print(paste('Number of buckets =',nb))
+
+  # make data frame for results
+  vt=data.frame( ngrams=character(), freq=integer(), bid=integer(),bname=character() )
+
+  bcount=0
+  # scan through the data
+  for (b in blist){
+    bcount= bcount +1
+    # print(paste('b =',b))
+
+    # THIS CHECK NEEDS TO COVER WHOLE CHUNK OF CODE...
+    # get text vector for the whole data set.
Bucket needs at least 2 threads + # n_gram_size = 1 for 1-grams, n_gram_size = 2 for pairs. + th = e[ e[[bucket_col]] ==b , ] + + if (nrow(th)>3) { ngdf = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2] } + + + nodes = length(unique(th[[cf]])) + + + # print(paste('nrow ngdf =',nrow(ngdf))) + + # add the bucket number and name + ngdf$bid = bcount + ngdf$bname = b + + # append the columns to the end + # vt is the whole set of all ngrams in all the windows + vt=rbind(vt,ngdf) + } + + + # convert to factor + vt$ngrams = factor(vt$ngrams) + + # nWindows = length(unique(vt$bid)) + nWindows = nb + #print(paste('nWindows =',nWindows)) + + # get the set of unique ngrams for the whole data set + vt_unique = data.frame(ngrams=unique(vt$ngrams)) + #print(vt_unique) + + # put the results here + windowFreqMatrix = matrix(0,nrow=nWindows, ncol=nrow(vt_unique)) + + for (i in 1:nWindows){ + + # get the merged list + vtmerge = merge(x=vt_unique, y=vt[vt$bid==i,], by='ngrams', all.x = TRUE) + + # use the bid.y to get the whole vector, but replace the NA with zeros + bb=vtmerge[vtmerge$bid==i,'freq'] + bb[is.na(bb)] <- 0 + + windowFreqMatrix[i,]=bb + + print( paste(i, 'num non-zero=', sum(windowFreqMatrix[i,] > 0))) + + } + + # #Make a matrix to filter by where all the entries are 0 < f < 1 + # fm = windowFreqMatrix/max(windowFreqMatrix) + # + # # if the number if below the threshold, set it to zero + # windowFreqMatrix[fm<=filter_threshold] = 0 + + df = data.table( + Clinic_week = character(nWindows), + Clinic = character(nWindows), + ymd = character(nWindows), + pct_retained = double(nWindows), + complexity = double(nWindows), + Dist_from_reference = double(nWindows), + Dist_from_next = double(nWindows) + ) + + for (i in 1:(nWindows-1) ) { + df[i, Clinic_week := blist[i] ] + # df[i, Clinic_ymd := blist[i] ] + # df[i, Clinic := unlist(strsplit(blist[i],'_'))[1] ] + # df[i, ymd := unlist(strsplit(blist[i],'_'))[2] ] + df[i, pct_retained := sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/sum(windowFreqMatrix[i,]>0) ] + df[i, complexity := estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ) ] + df[i, Dist_from_reference := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), method='cosine' ) ] + df[i, Dist_from_next := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), method='cosine' ) ] + } + + + # get the ngram data and labels + if (keep_ngram_vectors) { + b_df=as.data.frame(windowFreqMatrix[1:(nWindows-1),]) + colnames(b_df)=vt_unique$ngrams + + # stick the ngram frequencies on the end for good measure + df = cbind(df,b_df) + } + + # save the result and return it, too + save(df, file=paste0(save_file_name,'.rData') ) + + return(df) + +} \ No newline at end of file diff --git a/Tables and Figures/Clinic_day_plots.R b/Tables and Figures/Clinic_day_plots.R index a6e05a5..611f271 100644 --- a/Tables and Figures/Clinic_day_plots.R +++ b/Tables and Figures/Clinic_day_plots.R @@ -37,11 +37,121 @@ plots_for_papers <- function(){ ggplot(data = cds, aes(x = ymd, y = Dist_from_reference, group=Clinic)) + geom_line(aes(color=Clinic)) ggplot(data = cds, aes(x = ymd, y = rollmean(Dist_from_reference,5,na.pad=TRUE), group=Clinic)) + geom_line(aes(color=Clinic)) + # Dec 6th version + ggplot(data = cdt, aes(x = ymd.x, y = Dist_from_reference, group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank(), axis.text.y=element_blank()) + + ggplot(data = cdt, aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), 
group=Clinic.x)) +
+    geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank(), axis.text.y=element_blank())
+
+  ggplot(data = cdt, aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) +
+    geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank())
+
+
   # Role change over time
   ggplot(data = cds, aes(x = ymd, y = Dist_from_reference, group=Role)) + geom_line(aes(color=Role))
 
+  # get the wait times and visit duration for each clinic
+  # convert to minutes.
+  a= visits %>%
+    group_by(Clinic) %>%
+    summarize(n = n(),
+              NEvents = mean(NEvents, na.rm = TRUE),
+              wait1 = mean(wait1*60, na.rm = TRUE),
+              sdwait1 = sd(wait1*60, na.rm = TRUE),
+              wait2 = mean(wait2*60, na.rm = TRUE),
+              sdwait2 = sd(wait2*60, na.rm = TRUE),
+              Duration = mean(VisitDuration, na.rm = TRUE)
+    )
+
+  # The sd's above come out NA because summarize() lets later expressions see earlier results:
+  # wait1 has already been replaced by its (length 1) mean by the time sdwait1 is computed,
+  # and sd() of a single value is NA. So compute the sd's separately here.
+  visits %>%
+    group_by(Clinic) %>%
+    summarize(n = n(),
+              sdwait1 = sd(wait1*60, na.rm = TRUE),
+              sdwait2 = sd(wait2*60, na.rm = TRUE)
+    )
+
+  # get the percentage of actions by role... based on occurrences
+  Clinic_role_pct = otr %>%
+    group_by(Clinic, Role) %>%
+    summarize(n=n()) %>%
+    mutate(RolePct = n/sum(n))
+
+  # get LOS by clinic based on visits
+  Clinic_LOS_pct = visits %>%
+    group_by(Clinic, LOS_CPT) %>%
+    summarize(n=n()) %>%
+    mutate(LOSPct = n/sum(n)) %<%
+    spread(Clinic, Role)
+
+  # Make contingency table Clinic x LOS
+  C_LOS = visits %>%
+    group_by(Clinic, LOS_CPT) %>%
+    summarize(n=n()) %>%
+    spread( LOS_CPT,n)
+
+  # Convert NA to zero and then convert to table. Carefully remove rows/cols that make no sense
+  C_LOS= as.data.frame(C_LOS)
+  C_LOS[is.na(C_LOS)] =0
+  C_LOS = matrix(unlist(C_LOS), nrow=6, ncol=20)[1:5,2:19]
+  chisq.test(C_LOS)
+
+
+# Make contingency table Clinic x Role
+C_R = otr %>%
+  group_by(Clinic, Role) %>%
+  summarize(n=n()) %>%
+  spread( Role, n)
+
+# Convert NA to zero
+C_R = as.data.frame(C_R)
+C_R[is.na(C_R)] =0
+C_R = matrix(unlist(C_R), nrow=5, ncol=10)[1:5,2:10]
+chisq.test(C_R)
+
+  Clinic_staffing = otr %>%
+    group_by(Clinic,Role) %>%
+    summarize(total_staff=n_distinct(Role_ID)) %>%
+    spread( Role, total_staff)
+
+
+  # Get the handoffs per visit. Use the distinct chunks in the Visit-Role threads (VRThrds)
+  Role_Handoffs = VRThrds %>%
+    group_by(Clinic,Visit_ID) %>%
+    summarize(ho = n_distinct(threadNumVR))
+
+  Role_Handoffs_by_clinic = Role_Handoffs %>%
+    group_by(Clinic) %>%
+    summarize(avg_ho = mean(ho),
+              median_ho = median(ho),
+              max_ho = max(ho),
+              sd_ho = sd(ho))
+
+  # get the average staffing levels per day
+  Clinic_daily_staffing = otr %>%
+    group_by(Clinic,ymd,Role) %>%
+    summarize(total_staff=n_distinct(Role_ID))
+
+  Clinic_AVG_daily_staffing = Clinic_daily_staffing %>%
+    group_by(Clinic,Role) %>%
+    summarize(AVG_staff=mean(total_staff)) %>%
+    spread( Role, AVG_staff)
+
+}
+
+
+remove_outliers <- function(x, na.rm = TRUE, ...) {
+  qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...)
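+  # Tukey's rule: anything more than 1.5*IQR outside the quartiles is treated as an outlier and set to NA below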
+ H <- 1.5 * IQR(x, na.rm = na.rm) + y <- x + y[x < (qnt[1] - H)] <- NA + y[x > (qnt[2] + H)] <- NA + y } + # # make_box_plots <- function(){ # ggboxplot(ACHR_test[NEvents>100 & Clinic=='DRH'], x = "VisitMonth", y = "NetComplexity", From 169b388510549984d09c99e41473c466ca91aa9b Mon Sep 17 00:00:00 2001 From: Brian Pentland Date: Thu, 13 Dec 2018 05:35:26 -0500 Subject: [PATCH 19/31] Added functions to fix roles and compute LOS data --- Data Prep/ACHR_Batch_Aggregate_Data.R | 28 ++++++++++++++++++--- Data Prep/ACHR_Batch_PreProcess.R | 31 ++++++++++++++++++++++++ Tables and Figures/Clinic_day_plots.R | 35 +++++++++++++++++++++++++-- 3 files changed, 88 insertions(+), 6 deletions(-) diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R index 4bbd456..6e10653 100644 --- a/Data Prep/ACHR_Batch_Aggregate_Data.R +++ b/Data Prep/ACHR_Batch_Aggregate_Data.R @@ -79,6 +79,7 @@ ACHR_batch_threads <- function(occ, EVENT_CFs, ALL_CFs) { VisitDuration= double(N), wait_time1 = double(N), wait_time2 = double(N), + LOS =double(N), NetComplexity=double(N), Nodes=double(N), Edges=double(N), @@ -135,7 +136,10 @@ ACHR_batch_threads <- function(occ, EVENT_CFs, ALL_CFs) { } # Compute the alignment of the context factors - ACHR$CF_Alignment = as.numeric( as.character(ACHR$Action_count)) / as.numeric( as.character(ACHR$ALL_CF_count )) + ACHR$CF_Alignment := as.numeric( as.character(ACHR$Action_count)) / as.numeric( as.character(ACHR$ALL_CF_count )) + + # convert level of service to 1-5 integer + ACHR$LOS := convert_LOS( occ$LOS_CPT ) # Merge the results with the first row from each thread print('Merging results...') @@ -200,6 +204,7 @@ ACHR_batch_visit_role_threads <- function(occ, EVENT_CFs, ALL_CFs, visits) { NEventsVR = integer(N), threadStartVR = character(N), ThreadDurationVR =double(N), + LOS =double(N), NetComplexityVR=double(N), NodesVR=double(N), EdgesVR=double(N), @@ -250,6 +255,9 @@ ACHR_batch_visit_role_threads <- function(occ, EVENT_CFs, ALL_CFs, visits) { # Compute the alignment of the context factors # ACHR$CF_AlignmentVR = as.numeric( as.character(ACHR$Action_countVR)) / as.numeric( as.character(ACHR$ALL_CF_countVR )) + # convert level of service to 1-5 integer + ACHR$LOS := convert_LOS( occ$LOS_CPT ) + # Merge the results with the first row from each thread print('Merging results...') VRThrds=merge(x=ACHR, y=visits, by.x='Visit_ID', by.y ='Visit_ID' ,all=TRUE) @@ -536,10 +544,22 @@ count_daily_procedures <- function( df ) { } -# get_timeScale <- function(){'hr'} - - +# convert the LOS_CPT codes into a five point numeric scale +# operate on whole column to create new column +convert_LOS <- function(los_cpt){ + + # get the 5th character + s = sapply(los_cpt, function(cpt) { substring(as.character(cpt),5,5) } ) + + los = as.numeric(s) + + # convert 9 and NA to 1 + los[los==9]=1 + los[is.na(los)]=1 +return(los) + +} ################################################################### ################################################################### diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R index 27db4af..05ac916 100644 --- a/Data Prep/ACHR_Batch_PreProcess.R +++ b/Data Prep/ACHR_Batch_PreProcess.R @@ -32,6 +32,9 @@ redo_ACHR_data_from_scratch <- function(fname){ # first read the data o = read_ACHR_data( fname ) + # fix the role IDs -- needs file called RoleChangeTable.csv in the local directory + o = fix_derm_role_ID(o) + # Thread occurrences adds threadNum and seqNum to each thread, as defined by the the thread_CF. 
# Threads are always WITHIN VISITS in this code -- so they can be whole visits or chunks of visits (e.g., visit_ID_Role)
   occ = thread_occurrences( o, THREAD_CF ,fname )
@@ -123,6 +126,34 @@ cleanOccBatch <- function(fileRows){
   return(complete)
 }
+#######################################################################################
+# In the derm data, the role "technician" was used for people who were not technicians
+# Pass in the table of occurrences, fix it, and return it
+# read in the list of changes
+fix_derm_role_ID <- function(o){
+
+  # read in the new role ID
+  rc = read.csv('RoleChangeTable.csv')
+
+
+  # occurrences are a data.table so use :=
+  for (r in 1:nrow(rc)) {
+
+    old_rid = rc$OLD_Role_ID[r]
+    new_rid = rc$NEW_Role_ID[r]
+    new_role = rc$NEW_Role[r]
+
+    o[o$Role_ID==old_rid, Role_ID := new_rid]
+    o[o$Role_ID==old_rid, Role := new_role]
+
+  }
+
+  return(o)
+}
+
+
+
+
 #########################################################################################
 # This function adds columns for the thread, as requested ( Tested Nov 16 )
 # Then it sorts by thread and tStamp and adds thread/sequence numbers to each thread.
diff --git a/Tables and Figures/Clinic_day_plots.R b/Tables and Figures/Clinic_day_plots.R
index 611f271..d49a4ef 100644
--- a/Tables and Figures/Clinic_day_plots.R
+++ b/Tables and Figures/Clinic_day_plots.R
@@ -44,8 +44,8 @@ plots_for_papers <- function(){
   ggplot(data = cdt, aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) +
     geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank(), axis.text.y=element_blank())
 
-  ggplot(data = cdt, aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) +
-    geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank())
+  ggplot(data = cdt_435[,1:6], aes(x = ymd, y = rollmean( Dist_from_reference, 10,na.pad=TRUE), group=Clinic)) +
+    geom_line(aes(color=Clinic)) + theme(axis.text.x=element_blank())
@@ -139,6 +139,37 @@ chisq.test(C_R)
     summarize(AVG_staff=mean(total_staff)) %>%
     spread( Role, AVG_staff)
 
+  # let's look at the actions by clinic_role, using the visit_role threads
+actions_by_role = VRThrds %>%
+  group_by(Clinic,Role_VR) %>%
+  summarize(avgActions = mean(Action_countVR)) %>%
+  spread( Role_VR, avgActions)
+
+# Look at the longest thread duration by clinic and phase
+visits %>%
+  group_by(Clinic,Phase) %>%
+  summarize(w=max(ThreadDuration,na.rm = TRUE) ) %>%
+  spread( Phase, w)
+
+
+# Get LOS by clinic-day
+visits %>%
+  group_by(Clinic_ymd) %>%
+  summarize(los=mean(LOS,na.rm = TRUE) )
+
+
+# See if diagnoses differ by time period
+Diag_by_phase = visits %>%
+  group_by(Phase,Diagnosis_Group) %>%
+  summarize(n=n() ) %>%
+  spread( Diagnosis_Group, n)
+
+# See if actions differ by time period
+# cds_435_1 is a dataframe with one row for each clinic_day and ~300 columns for the actions
+Action_by_phase = cds_435_1 %>%
+  group_by(Phase) %>%
+  summarize_at(8:307,median )
+
 }
From 505f09ced5226f6082b22841ea0eee9ef6117a34 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Thu, 13 Dec 2018 06:12:50 -0500
Subject: [PATCH 20/31] fixed function for updating Role and Role_ID

---
 Data Prep/ACHR_Batch_PreProcess.R | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R
index 05ac916..af432be 100644
--- a/Data Prep/ACHR_Batch_PreProcess.R
+++ b/Data Prep/ACHR_Batch_PreProcess.R
@@ -22,6 +22,8 @@ redo_ACHR_data_from_scratch <- function(fname){
 
+  setwd("~/Documents/@NSF Healthcare Routines/Deidentified_Derm_data_Nov_2018")
+  fname = 'audit_111818'
 
   # Pick your point of view, or multiple POV...
@@ -37,7 +39,10 @@ redo_ACHR_data_from_scratch <- function(fname){
 
   # Thread occurrences adds threadNum and seqNum to each thread, as defined by the thread_CF.
   # Threads are always WITHIN VISITS in this code -- so they can be whole visits or chunks of visits (e.g., visit_ID_Role)
-  occ = thread_occurrences( o, THREAD_CF ,fname )
+  # make two versions -- one for whole visits and one for visit_ID_Role
+  ot = thread_occurrences( o, 'Visit_ID' ,fname )
+  otr = thread_occurrences( o, c('Visit_ID','Role') ,fname )
+
 
   # Now aggregate by clinic-day for each POV
   clinic_days = ACHR_batch_clinic_days(occ, THREAD_CF, EVENT_CF, fname)
@@ -133,7 +138,7 @@ cleanOccBatch <- function(fileRows){
 fix_derm_role_ID <- function(o){
 
   # read in the new role ID
-  rc = read.csv('RoleChangeTable.csv')
+  rc = read.csv('RoleChangeTable.csv', stringsAsFactors = FALSE)
 
 
   # occurrences are a data.table so use :=
@@ -143,8 +148,14 @@ fix_derm_role_ID <- function(o){
     new_rid = rc$NEW_Role_ID[r]
     new_role = rc$NEW_Role[r]
 
-    o[o$Role_ID==old_rid, Role_ID := new_rid]
-    o[o$Role_ID==old_rid, Role := new_role]
+    # pick out rows with old Role_ID
+    # old_rid_vector = o$Role_ID==old_rid
+    #
+    # o[old_rid_vector, Role_ID := new_rid]
+    # o[old_rid_vector, Role := new_role]
+
+    # crazy data table syntax... do it all in one shot
+    o[o$Role_ID==old_rid, ':=' (Role = new_role, Role_ID = new_rid ) ]
 
   }
 
From d5ea7c8987283d5bc7929f0c7ca33fec6914dbd2 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Fri, 14 Dec 2018 07:24:02 -0500
Subject: [PATCH 21/31] Fixed bugs with LOS assignment

---
 Data Prep/ACHR_Batch_Aggregate_Data.R | 15 +++++----------
 Data Prep/ACHR_Batch_PreProcess.R     |  6 +++---
 2 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R
index 6e10653..bdd2d57 100644
--- a/Data Prep/ACHR_Batch_Aggregate_Data.R
+++ b/Data Prep/ACHR_Batch_Aggregate_Data.R
@@ -136,10 +136,10 @@ ACHR_batch_threads <- function(occ, EVENT_CFs, ALL_CFs) {
   }
 
   # Compute the alignment of the context factors
-  ACHR$CF_Alignment := as.numeric( as.character(ACHR$Action_count)) / as.numeric( as.character(ACHR$ALL_CF_count ))
+  ACHR$CF_Alignment = as.numeric( as.character(ACHR$Action_count)) / as.numeric( as.character(ACHR$ALL_CF_count ))
 
-  # convert level of service to 1-5 integer
-  ACHR$LOS := convert_LOS( occ$LOS_CPT )
+  # convert level of service to 1-5 integer
+  ACHR$LOS = convert_LOS( firstOcc$LOS_CPT )
 
   # Merge the results with the first row from each thread
   print('Merging results...')
@@ -252,11 +252,6 @@ ACHR_batch_visit_role_threads <- function(occ, EVENT_CFs, ALL_CFs, visits) {
     for (cf in ALL_CFs){ ACHR[b, paste0(cf,"_countVR") := length(unique(df[[cf]])) ] }
   }
 
-  # Compute the alignment of the context factors
-  # ACHR$CF_AlignmentVR = as.numeric( as.character(ACHR$Action_countVR)) / as.numeric( as.character(ACHR$ALL_CF_countVR ))
-
-  # convert level of service to 1-5 integer
-  ACHR$LOS := convert_LOS( occ$LOS_CPT )
 
   # Merge the results with the first row from each thread
   print('Merging results...')
@@ -554,8 +549,8 @@ convert_LOS <- function(los_cpt){
   los = as.numeric(s)
 
-  # convert 9 and NA to 1
-  los[los==9]=1
-  los[is.na(los)]=1
+  # leave NA alone; convert the unclassifiable 9 codes to NA as well
+  los[los==9]=NA
+
 
 return(los)
 
diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R
index af432be..549c8aa 100644
--- a/Data Prep/ACHR_Batch_PreProcess.R
+++ b/Data
Prep/ACHR_Batch_PreProcess.R @@ -32,7 +32,7 @@ redo_ACHR_data_from_scratch <- function(fname){ ALL_CF = c('Action','Role','Workstation') # first read the data - o = read_ACHR_data( fname ) + o = read_ACHR_data( fname ) # nrow(o) = 7728927 # fix the role IDs -- needs file called RoleChangeTable.csv in the local directory o = fix_derm_role_ID(o) @@ -40,8 +40,8 @@ redo_ACHR_data_from_scratch <- function(fname){ # Thread occurrences adds threadNum and seqNum to each thread, as defined by the the thread_CF. # Threads are always WITHIN VISITS in this code -- so they can be whole visits or chunks of visits (e.g., visit_ID_Role) # make two versions -- one for whole visits and one for visit_ID_Role - ot = thread_occurrences( o, 'Visit_ID' ,fname ) - otr = thread_occurrences( o, c('Visit_ID','Role') ,fname ) + ot = thread_occurrences( o, 'Visit_ID' ,fname ) # nrow(ot)= 57835 + otr = thread_occurrences( o, c('Visit_ID','Role') ,fname ) # nrow(otr) = 527666 # Now aggregate by clinic-day for each POV From 91202e290b3a8b6cd41a0ea0da205cdc77ca732d Mon Sep 17 00:00:00 2001 From: Brian Pentland Date: Tue, 18 Dec 2018 23:19:11 -0500 Subject: [PATCH 22/31] Added function to find and count patterns of action --- .gitignore | 1 + Data Prep/ACHR_Batch_PreProcess.R | 9 +++-- Tables and Figures/Clinic_day_plots.R | 47 +++++++++++++++++++++++---- 3 files changed, 49 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 998975a..496052d 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,4 @@ emrt+Visit_ID+Role_Action_Workstation.Rdata emr+Visit_ID.Rdata Thrds+threadNum+Action_Role_Workstation.Rdata Nov 25 am workspace.RData +deleteme.rData diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R index 549c8aa..3b46176 100644 --- a/Data Prep/ACHR_Batch_PreProcess.R +++ b/Data Prep/ACHR_Batch_PreProcess.R @@ -48,10 +48,15 @@ redo_ACHR_data_from_scratch <- function(fname){ clinic_days = ACHR_batch_clinic_days(occ, THREAD_CF, EVENT_CF, fname) # Aggregate by visit for each POV - visits = ACHR_batch_visits(occ, THREAD_CF, EVENT_CF, fname) + # Whole visits + visits = ACHR_batch_visits(ot, THREAD_CF, EVENT_CF, fname) + + # And then by role + VRThreads <- ACHR_batch_visit_role_threads(otr, c('Action'), c('Action','Role','Workstation') , visits) + # compute trajectories to see difference from a reference graph - traj = graph_trajectory( occ, 'Clinic_ymd', EVENT_CF, 2, 1, 0, 'Clinic_trajectory') + traj = graph_trajectory( ot, 'Clinic_ymd', EVENT_CF, 2, 1, 0, 'Clinic_trajectory') } diff --git a/Tables and Figures/Clinic_day_plots.R b/Tables and Figures/Clinic_day_plots.R index d49a4ef..af69091 100644 --- a/Tables and Figures/Clinic_day_plots.R +++ b/Tables and Figures/Clinic_day_plots.R @@ -77,13 +77,15 @@ plots_for_papers <- function(){ Clinic_role_pct = otr %>% group_by(Clinic, Role) %>% summarize(n=n()) %>% - mutate(RolePct = n/sum(n)) + mutate(RolePct = n/sum(n)) %>% + select(Clinic, Role, RolePct) %>% + spread(Role, RolePct) # get LOS by clinic based on visits Clinic_LOS_pct = visits %>% group_by(Clinic, LOS_CPT) %>% summarize(n=n()) %>% - mutate(LOSPct = n/sum(n)) %<% + mutate(LOSPct = n/sum(n)) %>% spread(Clinic, Role) # Make contingency table Clinic x LOS @@ -118,7 +120,7 @@ chisq.test(C_R) # Get the handoffs per visit. 
Use the distinct chunks in the Visit-Role threads (VRThrds) - Role_Handoffs = VRThrds %>% + Role_Handoffs = VRThreads %>% group_by(Clinic,Visit_ID) %>% summarize(ho = n_distinct(threadNumVR)) @@ -136,11 +138,16 @@ chisq.test(C_R) Clinic_AVG_daily_staffing = Clinic_daily_staffing %>% group_by(Clinic,Role) %>% - summarize(AVG_staff=mean(total_staff)) %>% + summarize(AVG_staff=mean(total_staff, na.rm = TRUE)) %>% + spread( Role, AVG_staff) + + Clinic_AVG_daily_staffing = Clinic_daily_staffing %>% + group_by(Clinic,Role) %>% + summarize(AVG_staff=median(total_staff, na.rm = TRUE)) %>% spread( Role, AVG_staff) # let's look at the actions by clinic_role, using the visit_role threads -actions_by_role = VRThrds %>% +actions_by_role = VRThreads %>% group_by(Clinic,Role_VR) %>% summarize(avgActions = mean(Action_countVR)) %>% spread( Role_VR, avgActions) @@ -153,7 +160,7 @@ visits %>% # Get LOS by clinic-day -visits %>% +LOS_by_clinic_day = visits %>% group_by(Clinic_ymd) %>% summarize(los=mean(LOS,na.rm = TRUE) ) @@ -182,6 +189,34 @@ remove_outliers <- function(x, na.rm = TRUE, ...) { y } +# converts visit_ID into character string with factor levels for actions +make_visit_string <- function(occ, vid){ + + return(paste(as.character(as.numeric(occ$Action[occ$Visit_ID==as.character(vid)])),collapse = '-')) +} + +# Converts list of actions into regex of factor levels +make_process_pattern <- function(lvls, p){ + + return( paste( sapply(p, function(x) {as.character(which(lvls==x))}) , collapse = '.*') ) +} + +# Find process patterns +find_process_pattern <- function(occ, p, vid){ + + x = make_process_pattern( levels(ot$Action), p) + print(x) + + m = sapply(vid, function(v) { + (grepl( x, make_visit_string(occ, v))) } ) + # + + +# m = grep( x, make_visit_string(occ, vid)) + + return(m) + +} # # make_box_plots <- function(){ From 415aae73bc9828275daba8eb4579dd44918ca4c8 Mon Sep 17 00:00:00 2001 From: Brian Pentland Date: Fri, 21 Dec 2018 13:22:42 -0500 Subject: [PATCH 23/31] minor changes --- Data Prep/ACHR_Clinic_Trajectories_v3.R | 2 ++ Tables and Figures/Clinic_day_plots.R | 33 ++++++++++++++++++++++--- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/Data Prep/ACHR_Clinic_Trajectories_v3.R b/Data Prep/ACHR_Clinic_Trajectories_v3.R index 2328452..21002bf 100644 --- a/Data Prep/ACHR_Clinic_Trajectories_v3.R +++ b/Data Prep/ACHR_Clinic_Trajectories_v3.R @@ -178,7 +178,9 @@ graph_trajectory <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1, return(df) } +################################################################################## +################################################################################## graph_trajectory_week <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1, filter_threshold=0,keep_ngram_vectors=FALSE, save_file_name='deleteme') { # add column for bucket_CF if needed diff --git a/Tables and Figures/Clinic_day_plots.R b/Tables and Figures/Clinic_day_plots.R index af69091..6172796 100644 --- a/Tables and Figures/Clinic_day_plots.R +++ b/Tables and Figures/Clinic_day_plots.R @@ -44,11 +44,38 @@ plots_for_papers <- function(){ ggplot(data = cdt, aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank(), axis.text.y=element_blank()) - ggplot(data = cdt_435[,1:6], aes(x = ymd, y = rollmean( Dist_from_reference, 10,na.pad=TRUE), group=Clinic)) + - geom_line(aes(color=Clinic)) + theme(axis.text.x=element_blank()) + ggplot(data = cds_435_1 %>% 
filter(Phase=='one'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + ggplot(data = cds_435_1 %>% filter(Phase=='two'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435_1 %>% filter(Phase=='three'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435_1 %>% filter(Phase=='four'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435_1 %>% filter(Phase=='five'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) - + # Here are the pairs + ggplot(data = cds_435 %>% filter(Phase=='one'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435 %>% filter(Phase=='two'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435 %>% filter(Phase=='three'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435 %>% filter(Phase=='four'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435 %>% filter(Phase=='five'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + # ROle change over time ggplot(data = cds, aes(x = ymd, y = Dist_from_reference, group=Role)) + geom_line(aes(color=Role)) From 47e1507fba2415e1534aef62361f0704b167359a Mon Sep 17 00:00:00 2001 From: Brian Pentland Date: Fri, 21 Dec 2018 13:23:33 -0500 Subject: [PATCH 24/31] added plots for clinic days --- Tables and Figures/Clinic_day_plots.R | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Tables and Figures/Clinic_day_plots.R b/Tables and Figures/Clinic_day_plots.R index 6172796..18a6ce7 100644 --- a/Tables and Figures/Clinic_day_plots.R +++ b/Tables and Figures/Clinic_day_plots.R @@ -41,9 +41,7 @@ plots_for_papers <- function(){ ggplot(data = cdt, aes(x = ymd.x, y = Dist_from_reference, group=Clinic.x)) + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank(), axis.text.y=element_blank()) - ggplot(data = cdt, aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + - geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank(), axis.text.y=element_blank()) - + ggplot(data = cds_435_1 %>% filter(Phase=='one'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) @@ -75,6 +73,11 @@ plots_for_papers <- function(){ ggplot(data = cds_435 %>% filter(Phase=='five'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + geom_line(aes(color=Clinic.x)) + 
theme(axis.text.x=element_blank())
 
+  # these are based on sub-sets of the data by phase
+  ggplot(data = cdp2, aes(x = ymd, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic)) +
+    geom_line(aes(color=Clinic)) + theme(axis.text.x=element_blank(), axis.text.y=element_blank())
+
 
   # Role change over time
   ggplot(data = cds, aes(x = ymd, y = Dist_from_reference, group=Role)) + geom_line(aes(color=Role))
From 2c891d59adaeb097f2cf7b3d70cb33b8d63bf2de Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Sun, 6 Jan 2019 21:12:12 -0500
Subject: [PATCH 25/31] dual window for self comparison

---
 Data Prep/ACHR_Clinic_Trajectories_v3.R |  2 +-
 Tables and Figures/Clinic_day_plots.R   | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/Data Prep/ACHR_Clinic_Trajectories_v3.R b/Data Prep/ACHR_Clinic_Trajectories_v3.R
index 21002bf..06cd5ad 100644
--- a/Data Prep/ACHR_Clinic_Trajectories_v3.R
+++ b/Data Prep/ACHR_Clinic_Trajectories_v3.R
@@ -122,7 +122,7 @@ graph_trajectory <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1,
 
   for (i in 1:nWindows){
 
-    # get the merged list
+    # get the merged list of unique ngrams
     vtmerge = merge(x=vt_unique, y=vt[vt$bid==i,], by='ngrams', all.x = TRUE)
 
     # use the bid.y to get the whole vector, but replace the NA with zeros
diff --git a/Tables and Figures/Clinic_day_plots.R b/Tables and Figures/Clinic_day_plots.R
index 18a6ce7..b105277 100644
--- a/Tables and Figures/Clinic_day_plots.R
+++ b/Tables and Figures/Clinic_day_plots.R
@@ -248,6 +248,20 @@ find_process_pattern <- function(occ, p, vid){
 
 }
 
+dualWindowCosine <- function(df, clinic_col, colrange, windowsize){
+
+
+  rdf = data.frame(clinic = df[windowsize:(nrow(df)-windowsize),..clinic_col],
+                   ymd = df[windowsize:(nrow(df)-windowsize),ymd],
+                   corr = sapply(windowsize:(nrow(df)-windowsize),
+                                 function(x) {
+                                   distance(rbind( colSums(df[(x-windowsize+1):x,..colrange]),colSums(df[(x+1):(x+windowsize), ..colrange ]) ),
+                                            method='cosine' ) })
+  )
+  return(rdf)
+}
+
+
 #
 # make_box_plots <- function(){
 #   ggboxplot(ACHR_test[NEvents>100 & Clinic=='DRH'], x = "VisitMonth", y = "NetComplexity",
From c2a1df228a71c197bc5e8cdd5bd8aa5821f6f763 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Thu, 18 Apr 2019 21:24:55 -0400
Subject: [PATCH 26/31] many changes

---
 Data Prep/ACHR_Batch_Aggregate_Data.R   | 464 ++----------------------
 Data Prep/ACHR_Batch_V3.R               |  45 ++-
 Data Prep/ACHR_Clinic_Trajectories_v3.R | 150 +++++++-
 Data Prep/Comparison_functions.R        |  17 +
 4 files changed, 230 insertions(+), 446 deletions(-)

diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R
index bdd2d57..8accb84 100644
--- a/Data Prep/ACHR_Batch_Aggregate_Data.R
+++ b/Data Prep/ACHR_Batch_Aggregate_Data.R
@@ -20,6 +20,8 @@
 # c) merge them back together
 # d) Probably need to do different versions for visit vs. visit_role because a lot of the parameters make no sense...
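+# A quick hand-check of the graph entropy idea used below (log() here is the natural log):
+# the sequence A B A B has 2-grams {A B, B A, A B}, so the edge proportions are 2/3 and 1/3,
+# and H = -(2/3)*log(2/3) - (1/3)*log(1/3), which is about 0.64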
+# April 2019 Added compute_graph_entropy + ################################################################### ################################################################### # This is function is set up to aggregate the occurrences within threads -- typically visits @@ -27,7 +29,7 @@ # TN = threadNum in most cases # CFs can be chosen # event_CFs define changes within threads -# ALL_CFs are used to computer the CF_alignment +# ALL_CFs are used to compute the CF_alignment @@ -125,11 +127,13 @@ ACHR_batch_threads <- function(occ, EVENT_CFs, ALL_CFs) { ACHR[b, Nodes:=nrow(n$nodeDF) ] ACHR[b,Edges:=nrow(n$edgeDF) ] ACHR[b,CompressRatio := compression_index(df,new_event_col) ] - ACHR[b,Entropy := compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]) ] + # ACHR[b,Entropy := compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]) ] + ACHR[b,Entropy := compute_graph_entropy( df[[new_event_col]]) ] ACHR[b,NumProcedures := count_procedures(df$Proc[1]) ] ACHR[b,NumDiagnoses := count_diagnoses(df$Diag[1]) ] ACHR[b,ALL_CF_count := length(unique(df[[all_cf_col]])) ] - ACHR[b,ALL_CF_entropy := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ] + # ACHR[b,ALL_CF_entropy := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ] + ACHR[b,ALL_CF_entropy := compute_graph_entropy( df[[all_cf_col]]) ] # Count the unique elements in each cf for (cf in ALL_CFs){ ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] } @@ -244,9 +248,12 @@ ACHR_batch_visit_role_threads <- function(occ, EVENT_CFs, ALL_CFs, visits) { ACHR[b, NodesVR:=nrow(n$nodeDF) ] ACHR[b,EdgesVR:=nrow(n$edgeDF) ] ACHR[b,CompressRatioVR := compression_index(df,new_event_col) ] - ACHR[b,EntropyVR := compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]) ] + # ACHR[b,EntropyVR := compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]) ] + ACHR[b,EntropyVR := compute_graph_entropy( df[[new_event_col]]) ] + ACHR[b,ALL_CF_countVR := length(unique(df[[all_cf_col]])) ] - ACHR[b,ALL_CF_entropyVR := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ] + # ACHR[b,ALL_CF_entropyVR := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ] + ACHR[b,ALL_CF_entropyVR := compute_graph_entropy( df[[all_cf_col]]) ] # Count the unique elements in each cf for (cf in ALL_CFs){ ACHR[b, paste0(cf,"_countVR") := length(unique(df[[cf]])) ] } @@ -650,11 +657,14 @@ ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) { cds[b,Nodes := nrow(n$nodeDF) ] cds[b,Edges := nrow(n$edgeDF) ] cds[b,CompressRatio := compression_index(df,new_event_col) ] - cds[b,Entropy := compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]) ] + # cds[b,Entropy := compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]) ] + cds[b,Entropy := compute_graph_entropy( df[[new_event_col]]) ] + cds[b,NumProceduresPerDay := count_daily_procedures(df) ] cds[b,CF_Alignment := 1 ] # make placeholder, but compute below cds[b,ALL_CF_count := length(unique(df[[all_cf_col]])) ] - cds[b,ALL_CF_entropy := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ] + # cds[b,ALL_CF_entropy := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ] + cds[b,ALL_CF_entropy := compute_graph_entropy( df[[all_cf_col]]) ] # Count the unique elements in each cf for (cf in ALL_CFs){ cds[b, paste0(cf,"_countVR") := length(unique(df[[cf]])) ] } @@ -671,429 +681,19 @@ ACHR_batch_clinic_days <- 
function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) { } +compute_graph_entropy <- function(s){ + + # guard against insufficient data... + if (length(s) <2) return(0) + + # convert s into text vector + text_vector = concatenate(s) + + # get the 2-grams in the sequence s. 'prop' is the proportion of each edge. It sums to 1. + p = get.phrasetable(ngram(text_vector,2))[['prop']] + + # return Shannon entropy + return(-sum(p*log(p))) + +} -################################################################### -################################################################### -## OLD ### -## OLD ### -## OLD ### -# ACHR_batch_threads_old <- function(occ,TN, EVENT_CFs, ALL_CFs) { -# -# library(tidyr) -# library(dplyr) -# library(ThreadNet) -# library(ngram) -# library(lubridate) -# library(stringr) -# -# new_event_col = newColName(EVENT_CFs) -# if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } -# -# # pick subsets --here we aggregate one thread at a time -# bucket_list <- make_buckets_1(occ, TN) -# -# # get the size (number of buckets) -# N = length(bucket_list) -# c -# # pre-allocate the result -# ACHR = data.table(bucket=integer(N), -# NEvents = integer(N), -# ThreadStart= character(N), -# ThreadStartInt = integer(N), -# ThreadDuration =double(N), -# NetComplexity=double(N), -# Nodes=double(N), -# Edges=double(N), -# CompressRatio = double(N), -# Entropy = double(N), -# NumProcedures = double(N), -# NumDiagnoses = double(N), -# wait_time = double(N), -# Visit_number = integer(N), -# LOC_CPT = character(N), -# Visit_ID = character(N), -# Subject_ID = character(N), -# Clinic = character(N), -# ymd = character(N), -# Clinic_ymd = character(N), -# Procedure = character(N), -# Diagnosis = character(N), # diag in the raw data -# Diagnosis_group = character(N), -# Physician = character(N), -# Weekday = character(N), -# Month = character(N), -# Phase = character(N), -# CF_Alignment = double(N) -# ) -# -# # Now add columns for the CFs. There will be two for each CF -# for (cf in ALL_CFs){ -# ACHR[, paste0(cf,"_count"):= double(N)] -# ACHR[, paste0(cf,"_entropy"):= double(N)] -# } -# -# # loop through the buckets. 
Result will be data frame with one row per bucket -# for (i in 1:N){ -# -# b = i # as.integer(bucket_list[i]) -# -# # print once every 100 visits -# if (b%%100==0) {print(b)} -# -# # select the threads that go in this bucket -# df = occ[occ[[TN]] ==bucket_list[i] ] -# -# # bucket number -# ACHR[b,bucket := b] -# -# # length of the thread (number of rows) -# ACHR[b,NEvents := nrow(df)] -# -# # only do the computations if there are more than two occurrences -# if (nrow(df) > 2) { -# -# # compute the duration of the visit in hours -# ACHR[b,ThreadDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours') ] -# -# # compressibility of DV -# ACHR[b,CompressRatio := compression_index(df,new_event_col)] -# # ACHR[b,AR_CompressRatio := compression_index(df,DV2)] -# # ACHR[b,ARW_CompressRatio := compression_index(df,DV3)] -# -# # NetComplexity of DV -# # First get the network -# # NetComplexity of DV -# # First get the network -# n = threads_to_network_original(df,TN, new_event_col) -# ACHR[b,NetComplexity := estimate_network_complexity( n )] -# ACHR[b,Nodes := nrow(n$nodeDF) ] -# ACHR[b,Edges := nrow(n$edgeDF) ] -# -# -# # compute stuff on each context factor -# for (cf in ALL_CFs){ -# -# # Count the unique elements in each cf -# ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] -# -# # get the entropy -# ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] -# -# } -# } # kf nrows > 2 -# -# # Now copy in the rest of data -# # this only works because one visit is one bucket -# -# # ACHR[b,'VisitStart' := as.POSIXct( df[1,'tStamp']) ] -# ACHR[b,'ThreadStartInt' := df[1,'tStamp'] ] -# -# ACHR[b,'ThreadStart' := as.character( df[1,'tStamp'] ) ] -# -# ACHR[b,'Visit_ID' := df[1,'Visit_ID']] -# ACHR[b,'Subject_ID' := df[1,'Subject_ID']] -# ACHR[b,'Clinic' := df[1,'Clinic']] -# ACHR[b,'Procedure' := df[1,'Proc']] -# ACHR[b,'Diagnosis' := df[1,'Diag']] -# ACHR[b,'Diagnosis_group' := df[1,'Diagnosis_Group']] -# ACHR[b,'Physician' := df[1,'Physician']] -# ACHR[b,'Weekday' := df[1,'Weekday']] -# ACHR[b,'Month' := df[1,'Month']] -# -# -# -# } # loop thru buckets -# -# -# save(ACHR, file=paste0(paste('Threads',TN,new_event_col,sep='+'), '.Rdata')) -# -# -# # return the table -# return(ACHR) -# } - - -################################################################### -################################################################### -# ACHR_batch_clinic_days <- function(occ,TN, EVENT_CFs, ALL_CFs) { -# -# # Name for column that has events -- three variations -# DV1= newColName(CFs[1]) -# DV2= newColName(CFs[1:2]) -# DV3= newColName(CFs[1:3]) -# -# print(DV1) -# print(DV2) -# print(DV3) -# -# # name for column that defines the buckets -# BU = 'Clinic_ymd' -# -# # first get the date only -# # occ$ymd <- format(as.POSIXct(occ$tStamp),"%Y-%m-%d") -# -# # make new columns as needed for clinic + day and events -# # occ = unite(occ, 'Clinic_ymd', c('Clinic','ymd'),sep='_',remove = 'false') -# -# -# -# # pick subsets -- one visit at a time in this version, but could be more -# bucket_list <- unique(occ[['Clinic_ymd']]) -# -# # get the size (number of buckets) -# N = length(bucket_list) -# print(N) -# -# # pre-allocate the data.table. Tables are supposed to be faster. 
-# ACHR = data.table(bucket=integer(N), -# Clinic_date = character(N), -# YMD_date = character(N), -# NEvents = integer(N), -# NumVisits = numeric(N), -# A_NetComplexity=double(N), -# AR_NetComplexity=double(N), -# ARW_NetComplexity=double(N), -# A_Nodes=double(N), -# AR_Nodes=double(N), -# ARW_Nodes=double(N), -# A_Edges=double(N), -# AR_Edges=double(N), -# ARW_Edges=double(N), -# A_CompressRatio = double(N), -# AR_CompressRatio = double(N), -# ARW_CompressRatio = double(N), -# A_Entropy = double(N), -# AR_Entropy = double(N), -# ARW_Entropy = double(N), -# Clinic = character(N), -# NumUniqueProcedures = numeric(N), -# NumUniqueDiagnosisGroups = numeric(N), -# NumPhysicians = numeric(N), -# Weekday = character(N), -# Month = character(N) -# ) -# -# # Now add columns for the IVs. There will be three for each IV -# -# # Add the IV columns -# for (cf in CFs){ -# -# ACHR[, paste0(cf,"_count"):= double(N)] -# ACHR[, paste0(cf,"_compression"):= double(N)] -# ACHR[, paste0(cf,"_entropy"):= double(N)] -# -# } -# -# # loop through the buckets. Result will be data frame with one row per bucket -# for (i in 1:N){ -# -# b = i # as.integer(bucket_list[i]) -# -# # print once every 10 buckets -# if (b%%10==0) {print(b)} -# -# # select the threads that go in this bucket -# df = occ[occ[[BU]] ==bucket_list[i] ] -# -# # bucket number -# ACHR[b,bucket := b] -# -# # length of the thread (number of rows) -# ACHR[b,NEvents := nrow(df)] -# -# # only do the computations if there are more than two occurrences -# if (nrow(df) > 2) { -# -# # compute the duration of the visit in hours -# # ACHR[b,VisitDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours') ] -# -# # compressibility of DV -# ACHR[b,A_CompressRatio := compression_index(df,DV1)] -# ACHR[b,AR_CompressRatio := compression_index(df,DV2)] -# ACHR[b,ARW_CompressRatio := compression_index(df,DV3)] -# -# # NetComplexity of DV -# # First get the network -# n = threads_to_network_original(df,TN, DV1) -# ACHR[b,A_NetComplexity := estimate_network_complexity( n )] -# ACHR[b,A_Nodes := nrow(n$nodeDF) ] -# ACHR[b,A_Edges := nrow(n$edgeDF) ] -# -# n = threads_to_network_original(df,TN, DV2) -# ACHR[b,AR_NetComplexity := estimate_network_complexity( n )] -# ACHR[b,AR_Nodes := nrow(n$nodeDF) ] -# ACHR[b,AR_Edges := nrow(n$edgeDF) ] -# -# n = threads_to_network_original(df,TN, DV3) -# ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )] -# ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] -# ACHR[b,ARW_Edges := nrow(n$edgeDF) ] -# -# # get the entropy for AR and ARW -# ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] -# ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] -# -# -# # compute stuff on each context factor -# for (cf in CFs){ -# -# # Count the unique elements in each cf -# ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] -# -# # get the compression -# ACHR[b, paste0(cf,"_compression") := compression_index(df,cf) ] -# -# # get the entropy -# ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] -# -# } -# } # kf nrows > 2 -# -# # Now copy in the rest of data -# # this works because one visit is one bucket -# # count the number of different diagnoses --> typical indicators of complexity -# -# -# ACHR[b,'Clinic_date' := df[1,'Clinic_ymd']] -# ACHR[b,'YMD_date' := df[1,'ymd']] -# ACHR[b,'NumVisits' := length(unique(df[['Visit_ID']]))] -# ACHR[b,'Clinic' := df[1,'Clinic']] -# ACHR[b,'NumUniqueProcedures' := 
length(unique(df[['Proc']]))] -# ACHR[b,'NumUniqueDiagnosisGroups' := length(unique(df[['Diagnosis_Group']]))] -# ACHR[b,'NumPhysicians' := length(unique(df[['Physician']]))] -# ACHR[b,'Weekday' := df[1,'Weekday']] -# ACHR[b,'Month' := df[1,'Month']] -# -# -# -# } # loop thru buckets -# -# # copy this one for consistency -# ACHR[,'A_Entropy' := ACHR[,'Action_entropy'] ] -# -# # return the table -# return(ACHR) -# } - - - -# need to bring Role_ID forward when looking at Visit_Role, so we can track residents over time. - -# ACHR_batch_threads_no_merge_old <- function(occ,THREAD_CFs, EVENT_CFs, ALL_CFs) { -# -# library(tidyr) -# library(dplyr) -# library(ThreadNet) -# library(ngram) -# library(lubridate) -# library(stringr) -# library(data.table) -# -# # Add columns for combinations of CFs if needed -# thread_col = newColName(THREAD_CFs) -# TN = thread_col -# if (!(thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CFs,thread_col) } -# -# -# new_event_col = newColName(EVENT_CFs) -# if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } -# -# all_cf_col = newColName(ALL_CFs) -# if (!(all_cf_col %in% colnames(occ))) { occ = combineContextFactors(occ,ALL_CFs,all_cf_col) } -# -# # set key on the data.table -# setkeyv(occ, TN) -# -# # get the list of buckets -# bucket_list <- unique(occ[[TN]]) -# -# # print the number of buckets -# print(paste0('Number of buckets=', length(bucket_list) )) -# -# -# # make data frame with results -# Thrds = data.table( t(sapply( bucket_list, -# function(b){ -# -# # select a subset -# df= occ[get(TN)==b] -# -# # make sure it is sorted by timestamp -# df=df[order(df$tStamp),] -# -# # get the network -- only if there are enough rows... -# -# if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) -# else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) -# -# # compute each parameter and put them in a vector -# c( -# bucket=b, -# Clinic = as.character(df$Clinic[1]), -# Physician = as.character(df$Physician[1]), -# ymd = as.character(df$ymd[1]), -# Clinic_ymd = as.character(df$Clinic_ymd[1]), -# Weekday = as.character(df$Weekday[1]), -# Month = as.character(df$Month[1]), -# Phase = compute_phase(df$tStamp[1]), -# threadNum = as.numeric(df$threadNum[1]), -# Visit_ID = as.character(df$Visit_ID[1]), -# Subject_ID = as.character( df$Subject_ID[1] ), -# Role_ID = as.character( df$Role_ID[1] ), -# NEvents = nrow(df), -# ThreadStart= as.character(df$tStamp[1] ), -# ThreadDuration= compute_thread_duration(df), -# VisitDuration= compute_visit_duration(df), -# wait_time1 = compute_wait_time1(df), -# wait_time2 = compute_wait_time2(df), -# Visit_number = as.numeric(df[1,'Visitnum_Total']), -# LOC_CPT = as.character(df[1,'LOS_CPT']), -# NetComplexity=estimate_network_complexity( n ), -# Nodes=nrow(n$nodeDF), -# Edges=nrow(n$edgeDF), -# CompressRatio = compression_index(df,new_event_col), -# Entropy = compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]), -# NumProcedures = count_procedures(df$Proc[1]), -# NumDiagnoses = count_diagnoses(df$Diag[1]), -# Proc = as.character(df[1,'Proc']), -# Diagnosis = as.character(df[1,'Diag']), -# Diagnosis_group = as.character(df[1,'Diagnosis_Group']), -# CF_Alignment = 1, # make placeholder, but compute below -# ALL_CF_count = length(unique(df[[all_cf_col]])), -# ALL_CF_entropy = compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]), -# -# # name these columns afterwards -# sapply(ALL_CFs, function(cf){ -# c( length(unique(df[[cf]])) -# # , 
-# # compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) -# ) }) -# -# ) -# } ))) -# -# -# # name the last columns -- code has to match above -# cn = as.vector(sapply(ALL_CFs, function(cf){ -# c( paste0(cf,"_count") -# # , -# # paste0(cf,"_entropy") -# ) })) -# -# # now assign them to the last columns -# last_col = ncol(Thrds) -# first_col= last_col-length(cn)+1 -# setnames(Thrds, c(first_col:last_col), cn) -# -# # Compute the alignment of the context factors -# Thrds$CF_Alignment = as.numeric( as.character(Thrds$Action_count)) / as.numeric( as.character(Thrds$ALL_CF_count )) -# -# save_file_name = paste0(paste('Thrds',TN,new_event_col,sep='+'), '.Rdata') -# save(Thrds, file=save_file_name) -# -# print(paste('Saved ', nrow(Thrds), " records in ",save_file_name)) -# -# return(Thrds) -# -# } -# diff --git a/Data Prep/ACHR_Batch_V3.R b/Data Prep/ACHR_Batch_V3.R index a404eec..3160953 100644 --- a/Data Prep/ACHR_Batch_V3.R +++ b/Data Prep/ACHR_Batch_V3.R @@ -281,9 +281,13 @@ ACHR_batch_clinic_days <- function(occ,TN, CFs) { ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] ACHR[b,ARW_Edges := nrow(n$edgeDF) ] + # # get the entropy for AR and ARW + # ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] + # ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] + # get the entropy for AR and ARW - ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] - ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] + ACHR[b, AR_Entropy := compute_graph_entropy_TEST(df[[DV2]]) ] + ACHR[b, ARW_Entropy := compute_graph_entropy_TEST(df[[DV3]]) ] # compute stuff on each context factor @@ -296,7 +300,10 @@ ACHR_batch_clinic_days <- function(occ,TN, CFs) { ACHR[b, paste0(cf,"_compression") := compression_index(df,cf) ] # get the entropy - ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] + # ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] + + # get the graph entropy + ACHR[b, paste0(cf,"_entropy") := compute_graph_entropy_TEST(df[[cf]]) ] } } # kf nrows > 2 @@ -442,9 +449,12 @@ for (i in 1:N){ ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] ACHR[b,ARW_Edges := nrow(n$edgeDF) ] - # get the entropy for AR and ARW - ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] - ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] + # # get the entropy for AR and ARW + # ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] + # ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] + + ACHR[b, AR_Entropy := compute_graph_entropy_TEST( df[[DV2]] ) ] + ACHR[b, ARW_Entropy := compute_graph_entropy_TEST( df[[DV3]] ) ] # compute stuff on each context factor @@ -457,8 +467,9 @@ for (i in 1:N){ ACHR[b, paste0(cf,"_compression") := compression_index(df,cf) ] # get the entropy - ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] - + # ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] + ACHR[b, paste0(cf,"_entropy") := compute_graph_entropy_TEST( df[[cf]]) ] + } } # kf nrows > 2 @@ -539,3 +550,21 @@ cleanOccBatch <- function(fileRows){ get_timeScale <- function(){'hr'} +# create a function to compute graph entropy +# It will use the standard Shannon entropy, but it will apply it to the frequencies of edges in the +# adjacency matrix. 
To do this, we just need to count the 2-grams in the sequence and divide by the total +# number of 2-grams to get the probabilities. You should be able to pass in any sequence. +# NOTE: This version will work on one sequence at a time (eg. one patient visit) +# Needs to be revised to work on groups of visits (e.g,. clinic-days) +compute_graph_entropy_TEST <- function(s){ + + # first convert s into text vector + text_vector = concatenate(s) + + # get the 2-grams in the sequence s. ng$prop is the proportion of each edge. It sums to 1. + p = get.phrasetable(ngram(text_vector,2))[['prop']] + + # return Shannon entropy + return(-sum(p*log(p))) + +} diff --git a/Data Prep/ACHR_Clinic_Trajectories_v3.R b/Data Prep/ACHR_Clinic_Trajectories_v3.R index 06cd5ad..0bd2cd0 100644 --- a/Data Prep/ACHR_Clinic_Trajectories_v3.R +++ b/Data Prep/ACHR_Clinic_Trajectories_v3.R @@ -62,6 +62,136 @@ get_bucket <- function(o, b ){ # graph_trajectory <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1, filter_threshold=0,keep_ngram_vectors=FALSE, save_file_name='deleteme') { + # add column for bucket_CF if needed + bucket_col = newColName(bucket_CFs) + if (!(bucket_col %in% colnames(e))) { e = combineContextFactors(e,bucket_CFs,bucket_col) } + + # here is the sorted list of buckets + blist = sort(unique(e[[bucket_col]])) + + # now many buckets? + nb = length(blist) + # print(blist) + print(paste('Number of buckets =',nb)) + + # make data frame for results + vt=data.frame( ngrams=character(), freq=integer(), bid=integer(),bname=character() ) + + bcount=0 + # scan through the data + for (b in blist){ + bcount= bcount +1 + # print(paste('b =',b)) + + # THIS CHECK NEEDS TO COVER WHOLE CHUNK OF CODE... + # get text vector for the whole data set. Bucket needs at least 2 threads + # n_gram_size = 1 for 1-grams, n_gram_size = 2 for pairs. 
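+    # pull every occurrence row for this bucket (e.g., one clinic-day when bucket_CFs = 'Clinic_ymd')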
+    th = e[ e[[bucket_col]] ==b , ]
+
+    if (nrow(th)>3)
+    {
+      ngdf = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2]
+
+
+      nodes = length(unique(th[[cf]]))
+
+
+      # print(paste('nrow ngdf =',nrow(ngdf)))
+
+      # add the bucket number and name
+      ngdf$bid = bcount
+      ngdf$bname = b
+
+      # append the columns to the end
+      # vt is the whole set of all ngrams in all the windows
+      vt=rbind(vt,ngdf)
+    }
+  }
+
+
+  # convert to factor
+  vt$ngrams = factor(vt$ngrams)
+
+  # nWindows = length(unique(vt$bid))
+  nWindows = nb
+  #print(paste('nWindows =',nWindows))
+
+  # get the set of unique ngrams for the whole data set
+  vt_unique = data.frame(ngrams=unique(vt$ngrams))
+  #print(vt_unique)
+
+  # put the results here
+  windowFreqMatrix = matrix(0,nrow=nWindows, ncol=nrow(vt_unique))
+
+  for (i in 1:nWindows){
+
+    # get the merged list of unique ngrams
+    vtmerge = merge(x=vt_unique, y=vt[vt$bid==i,], by='ngrams', all.x = TRUE)
+
+    # use the bid.y to get the whole vector, but replace the NA with zeros
+    bb=vtmerge[vtmerge$bid==i,'freq']
+    bb[is.na(bb)] <- 0
+
+    windowFreqMatrix[i,]=bb
+
+    print( paste(i, 'num non-zero=', sum(windowFreqMatrix[i,] > 0)))
+
+  }
+
+  # #Make a matrix to filter by where all the entries are 0 < f < 1
+  # fm = windowFreqMatrix/max(windowFreqMatrix)
+  #
+  # # if the number is below the threshold, set it to zero
+  # windowFreqMatrix[fm<=filter_threshold] = 0
+
+  df = data.table(
+    Clinic_ymd = character(nWindows),
+    Clinic = character(nWindows),
+    # Role_ymd = character(nWindows),
+    # Role = character(nWindows),
+    ymd = character(nWindows),
+    pct_retained = double(nWindows),
+    complexity = double(nWindows),
+    Dist_from_reference = double(nWindows),
+    Dist_from_next = double(nWindows)
+  )
+
+  for (i in 1:(nWindows-1) ) {
+
+    df[i, Clinic_ymd := blist[i] ]
+    df[i, Clinic := unlist(strsplit(blist[i],'_',fixed=TRUE))[1] ]
+    df[i, ymd := unlist(strsplit(blist[i],'_',fixed=TRUE))[2] ]
+
+    # df[i, Role_ymd := as.character(blist[i]) ]
+    # df[i, Role := unlist(strsplit(as.character(blist[i]),'+',fixed=TRUE))[1] ]
+    # df[i, ymd := unlist(strsplit(as.character(blist[i]),'+',fixed=TRUE))[2] ]
+
+    df[i, pct_retained := sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/sum(windowFreqMatrix[i,]>0) ]
+    df[i, complexity := estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ) ]
+    df[i, Dist_from_reference := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), method='cosine' ) ]
+    df[i, Dist_from_next := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), method='cosine' ) ]
+  }
+
+
+  # get the ngram data and labels
+  if (keep_ngram_vectors) {
+    b_df=as.data.frame(windowFreqMatrix[1:(nWindows-1),])
+    colnames(b_df)=vt_unique$ngrams
+
+    # stick the ngram frequencies on the end for good measure
+    df = cbind(df,b_df)
+  }
+
+  # save the result and return it, too
+  save(df, file=paste0(save_file_name,'.rData') )
+
+  return(df)
+
+}
+##################################################################################
+
+graph_trajectory_ROLES <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1, filter_threshold=0,keep_ngram_vectors=FALSE, save_file_name='deleteme') {
+
   # add column for bucket_CF if needed
   bucket_col = newColName(bucket_CFs)
   if (!(bucket_col %in% colnames(e))) { e = combineContextFactors(occ,bucket_CFs,bucket_col) }
@@ -142,8 +272,10 @@ graph_trajectory <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1,
 #   windowFreqMatrix[fm<=filter_threshold] = 0
 
   df = data.table(
-    Clinic_ymd = character(nWindows),
-    Clinic = character(nWindows),
+
+graph_trajectory_ROLES <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1, filter_threshold=0,keep_ngram_vectors=FALSE, save_file_name='deleteme') {
+
+  # add column for bucket_CF if needed
+  bucket_col = newColName(bucket_CFs)
+  if (!(bucket_col %in% colnames(e))) { e = combineContextFactors(occ,bucket_CFs,bucket_col) }
@@ -142,8 +272,10 @@ graph_trajectory <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1,
   # windowFreqMatrix[fm<=filter_threshold] = 0
 
   df = data.table(
-    Clinic_ymd = character(nWindows),
-    Clinic = character(nWindows),
+    # Clinic_ymd = character(nWindows),
+    # Clinic = character(nWindows),
+    Role_ymd = character(nWindows),
+    Role = character(nWindows),
     ymd = character(nWindows),
     pct_retained = double(nWindows),
     complexity = double(nWindows),
@@ -153,10 +285,16 @@ graph_trajectory <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1,
 
   for (i in 1:(nWindows-1) ) {
 
-    df[i, Clinic_ymd := blist[i] ]
-    df[i, Clinic := unlist(strsplit(blist[i],'_'))[1] ]
-    df[i, ymd := unlist(strsplit(blist[i],'_'))[2] ]
-    df[i, pct_retained := sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/sum(windowFreqMatrix[i,]>0) ]
+    # df[i, Clinic_ymd := blist[i] ]
+    # df[i, Clinic := unlist(strsplit(blist[i],'_'))[1] ]
+    # df[i, ymd := unlist(strsplit(blist[i],'_'))[2] ]
+
+    df[i, Role_ymd := as.character(blist[i]) ]
+    df[i, Role := unlist(strsplit(as.character(blist[i]),'+',fixed=TRUE))[1] ]
+    df[i, ymd := unlist(strsplit(as.character(blist[i]),'+',fixed=TRUE))[2] ]
+
+
+    df[i, pct_retained := sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/sum(windowFreqMatrix[i,]>0) ]
     df[i, complexity := estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ) ]
     df[i, Dist_from_reference := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), method='cosine' ) ]
     df[i, Dist_from_next := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), method='cosine' ) ]
diff --git a/Data Prep/Comparison_functions.R b/Data Prep/Comparison_functions.R
index 7d94256..4c962f9 100644
--- a/Data Prep/Comparison_functions.R
+++ b/Data Prep/Comparison_functions.R
@@ -31,3 +31,20 @@ get_range <- function(o, start_date, end_date){
   o %>% filter(as.Date(tStamp) > start_date & as.Date(tStamp) < end_date)
 
 }
+
+
+# get start dates -- just a place to stick some code
+# April 2019
+
+get_start_dates <- function(){
+
+  # first get the roles that are relatively frequent
+  role_freq2 = as.data.frame(table(ot$Role_ID)[table(ot$Role_ID)>1000])
+
+  # then get their start dates
+  role_freq2$start_date = sapply(role_freq2$Var1, function(x) {min(as.character(ot$tStamp[ot$Role_ID==as.character(x)]))})
+
+  # then trim it so it's more readable.
+  role_freq2$start_date = substr(role_freq2$start_date,1,10)
+
+}
\ No newline at end of file
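Editor's note: get_start_dates() filters a frequency table and then summarizes each remaining level; the same pattern on toy data (the >1000 cutoff is lowered to >2 here, and names() is used in place of the as.data.frame(table(...)) step):

    role <- c('RN','RN','RN','MA','MD','MD','MD')
    ts   <- c('2016-01-05 08:00:00', '2016-02-01 09:00:00', '2016-03-01 10:00:00',
              '2016-04-01 11:00:00', '2016-01-20 12:00:00', '2016-02-02 13:00:00',
              '2016-03-03 14:00:00')

    tab   <- table(role)
    keep  <- names(tab)[tab > 2]                           # the relatively frequent roles
    start <- sapply(keep, function(x) min(ts[role == x]))  # ISO timestamps sort lexically
    substr(start, 1, 10)                                   # trim to YYYY-MM-DD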

From 143ce33c482b3f7a15d532193ccfee12e81cc3c6 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Thu, 18 Apr 2019 21:26:20 -0400
Subject: [PATCH 27/31] ignore

---
 .gitignore | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/.gitignore b/.gitignore
index 496052d..4859490 100644
--- a/.gitignore
+++ b/.gitignore
@@ -35,3 +35,16 @@ emr+Visit_ID.Rdata
 Thrds+threadNum+Action_Role_Workstation.Rdata
 Nov 25 am workspace.RData
 deleteme.rData
+.RDataTmp
+CDT_435_1.rData
+MW2_5 Edges.png
+VRThrds+threadNum+Action.Rdata
+Visits_and_cds_435_for_inkyu.rdata
+cd_role_trajectory.rData
+cd_role_trajectory1.rData
+cd_role_trajectory2.rData
+cd_wk_trajectory1.rData
+cd_wk_trajectory2.rData
+cds+Clinic_ymd+Action_Role_Workstation.Rdata
+ot-otr-vrthreads-for-inkyu.rdata
+trajectory2grams.rData

From 3ccce6ebdc28a3f701ddb26154f1cb981490a584 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Mon, 6 May 2019 11:17:03 -0400
Subject: [PATCH 28/31] new version of get-ngrams-for-buckets

---
 Data Prep/ACHR_Batch_Aggregate_Data.R   | 14 ++++
 Data Prep/ACHR_Clinic_Trajectories_v3.R | 96 +++++++++++++++++++++++++
 2 files changed, 110 insertions(+)

diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R
index 8accb84..a300ce0 100644
--- a/Data Prep/ACHR_Batch_Aggregate_Data.R
+++ b/Data Prep/ACHR_Batch_Aggregate_Data.R
@@ -697,3 +697,17 @@ compute_graph_entropy <- function(s){
 
 }
 
+get_action_frequencies <- function(s,n){
+
+  # guard against insufficient data...
+  if (length(s) < n) return(0)
+
+  # convert s into text vector
+  text_vector = concatenate(s)
+
+  # get the n-grams in the sequence s. 'freq' is the frequency
+  return( get.phrasetable(ngram(text_vector, n )) )
+
+}
+
+
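Editor's note: get_action_frequencies() chains three calls from the ngram package (concatenate, ngram, get.phrasetable); a minimal, self-contained usage sketch with a toy action sequence, assuming the ngram package is installed:

    library(ngram)

    s  <- c('check_in', 'rooming', 'vitals', 'exam', 'check_out')
    tv <- concatenate(s)             # one space-delimited string
    get.phrasetable(ngram(tv, 2))    # data frame of 2-grams with freq and prop columns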
diff --git a/Data Prep/ACHR_Clinic_Trajectories_v3.R b/Data Prep/ACHR_Clinic_Trajectories_v3.R
index 0bd2cd0..56376e1 100644
--- a/Data Prep/ACHR_Clinic_Trajectories_v3.R
+++ b/Data Prep/ACHR_Clinic_Trajectories_v3.R
@@ -317,6 +317,102 @@ graph_trajectory_ROLES <- function(e, bucket_CFs, cf, n_gram_size=2, reference_
 }
 ##################################################################################
 
+# e = otr, needs to be set up to include threads that are visit_ID_Role
+# there must be a 'threadNum' assigned in each row
+#
+# cf is the context factor we want to follow, such as 'Action'
+
+
+get_ngrams_for_buckets <- function(e, cf, n_gram_size=2, filter_threshold=2, save_file_name='deleteme') {
+
+
+  # here is the sorted list of buckets
+  blist = sort(unique(e$threadNum))
+
+  # how many buckets?
+  nb = length(blist)
+  # print(blist)
+  print(paste('Number of buckets =',nb))
+
+  # make data frame for results
+  vt=data.frame( ngrams=character(), freq=integer(), bid=integer(),bname=character() )
+
+  bcount=0
+  # scan through the data
+  for (b in blist){
+    bcount= bcount +1
+    # print(paste('b =',b))
+
+    # n_gram_size = 1 for 1-grams, n_gram_size = 2 for pairs
+    th = e[ e[['threadNum']] ==b , ]
+
+    if (nrow(th)>filter_threshold) {
+      ngdf = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2]
+
+      # add the bucket number and name
+      ngdf$bid = bcount
+      ngdf$bname = b
+
+      # append the columns to the end
+      # vt is the whole set of all ngrams in all the windows
+      vt=rbind(vt,ngdf)
+    }
+  }
+
+
+  # convert to factor
+  vt$ngrams = factor(vt$ngrams)
+
+  # nWindows = length(unique(vt$bid))
+  nWindows = nb
+  #print(paste('nWindows =',nWindows))
+
+  # get the set of unique ngrams for the whole data set
+  vt_unique = data.frame(ngrams=unique(vt$ngrams))
+  #print(vt_unique)
+
+  # put the results here
+  windowFreqMatrix = matrix(0,nrow=nWindows, ncol=nrow(vt_unique))
+
+  for (i in 1:nWindows){
+
+    # get the merged list of unique ngrams for this bucket
+    vtmerge = merge(x=vt_unique, y=vt[vt$bid==i,], by='ngrams', all.x = TRUE)
+
+    # use the bid.y to get the whole vector, but replace the NA with zeros
+    bb=vtmerge[vtmerge$bid==i,'freq']
+    bb[is.na(bb)] <- 0
+
+    windowFreqMatrix[i,]=bb
+
+    print( paste(i, 'num non-zero=', sum(windowFreqMatrix[i,] > 0)))
+
+  }
+
+
+  # get the ngram data and labels
+  # convert matrix to data frame
+  # df=as.data.frame(windowFreqMatrix[1:(nWindows-1),])
+  df=as.data.frame(windowFreqMatrix)
+
+  # add the column names
+  colnames(df)=vt_unique$ngrams
+
+  # add the bucket names
+  df = cbind(as.data.frame(blist), df)
+  colnames(df)[1] <- c("threadNum")
+
+
+  # save the result and return it, too
+  #save(df, file=paste0(save_file_name,'.rData') )
+
+  return(df)
+
+}
 ##################################################################################
 
 graph_trajectory_week <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1, filter_threshold=0,keep_ngram_vectors=FALSE, save_file_name='deleteme') {
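Editor's note: both the trajectory functions and get_ngrams_for_buckets() rely on the same merge-and-zero-fill idiom to align one bucket's ngram counts to the global vocabulary; here it is isolated on toy data (simplified to take m$freq directly rather than re-subsetting on bid):

    vt_unique <- data.frame(ngrams = c('a b', 'b c', 'c a'))   # global vocabulary
    vt_i      <- data.frame(ngrams = c('a b', 'c a'), freq = c(4, 1), bid = 1)

    m <- merge(x = vt_unique, y = vt_i, by = 'ngrams', all.x = TRUE)   # left join
    f <- m$freq
    f[is.na(f)] <- 0    # ngrams absent from this bucket get frequency 0
    f                   # c(4, 0, 1): one row of the bucket-by-ngram matrix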

From 6aa02ba4c263c45f506abf519a43078a821d40f0 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Tue, 4 Jun 2019 12:14:32 -0400
Subject: [PATCH 29/31] added notes for ortho/peds changes

---
 Data Prep/ACHR_Batch_Aggregate_Data.R | 24 +++++++++++++++++++++++-
 Data Prep/ACHR_Batch_PreProcess.R     |  6 ++++--
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R
index a300ce0..4a5416e 100644
--- a/Data Prep/ACHR_Batch_Aggregate_Data.R
+++ b/Data Prep/ACHR_Batch_Aggregate_Data.R
@@ -92,7 +92,12 @@ ACHR_batch_threads <- function(occ, EVENT_CFs, ALL_CFs) {
                      ALL_CF_count = integer(N),
                      ALL_CF_entropy = double(N),
                      CF_Alignment = double(N) )
-
+
+  #################################################
+  # add columns for time of each role (or something)
+
+
   # Now add columns for the CF counts.
   for (cf in ALL_CFs){
     ACHR[, paste0(cf,"_count"):= double(N)]
@@ -135,6 +140,12 @@ ACHR_batch_threads <- function(occ, EVENT_CFs, ALL_CFs) {
     # ACHR[b,ALL_CF_entropy := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ]
     ACHR[b,ALL_CF_entropy := compute_graph_entropy( df[[all_cf_col]]) ]
 
+    #################################################
+    # Compute time of each role
+    # need to difftime start-end of each thread -- compute_thread_duration
+    # sum up the role chunks for each role within each thread
+
+
     # Count the unique elements in each cf
     for (cf in ALL_CFs){ ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] }
   }
@@ -160,6 +171,7 @@ ACHR_batch_threads <- function(occ, EVENT_CFs, ALL_CFs) {
 #################################################################################
 # streamlined version for threads that are PART OF A VISIT. So you can merge with visits
 # to get the visit context.  Just compute the minimum for speed.
+# Needs to pass in the visit data to merge
 ACHR_batch_visit_role_threads <- function(occ, EVENT_CFs, ALL_CFs, visits) {
 
   library(tidyr)
@@ -216,6 +228,9 @@ ACHR_batch_visit_role_threads <- function(occ, EVENT_CFs, ALL_CFs, visits) {
                      EntropyVR = double(N),
                      ALL_CF_countVR = integer(N),
                      ALL_CF_entropyVR = double(N) )
+
+  #################################################
+  # add columns for time of each role (or something)
 
   # Now add columns for the CF counts.
   for (cf in ALL_CFs){
@@ -255,6 +270,12 @@ ACHR_batch_visit_role_threads <- function(occ, EVENT_CFs, ALL_CFs, visits) {
     # ACHR[b,ALL_CF_entropyVR := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ]
     ACHR[b,ALL_CF_entropyVR := compute_graph_entropy( df[[all_cf_col]]) ]
 
+    #################################################
+    # Compute time of each role
+    # need to difftime start-end of each thread -- compute_thread_duration
+    # sum up the role chunks for each role within each thread
+
+
     # Count the unique elements in each cf
     for (cf in ALL_CFs){ ACHR[b, paste0(cf,"_countVR") := length(unique(df[[cf]])) ] }
   }
@@ -370,6 +391,7 @@ ACHR_batch_visit_add_columns <- function(occ, EVENT_CFs, visits) {
 }
 ##################################################################################
 # Helper functions
+#################################################################################
 
 compute_phase <- function(t){
   t=as.Date(t)
diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R
index 3b46176..96800d3 100644
--- a/Data Prep/ACHR_Batch_PreProcess.R
+++ b/Data Prep/ACHR_Batch_PreProcess.R
@@ -28,7 +28,8 @@ redo_ACHR_data_from_scratch <- function(fname){
   # Pick your point of view, or multiple POV...
   THREAD_CF = c('Visit_ID')
-  EVENT_CF = c('Action','Role','Workstation')
+  # EVENT_CF = c('Action','Role','Workstation')
+  EVENT_CF = c('Action')
   ALL_CF = c('Action','Role','Workstation')
 
   # first read the data
@@ -41,7 +42,8 @@ redo_ACHR_data_from_scratch <- function(fname){
   # Threads are always WITHIN VISITS in this code -- so they can be whole visits or chunks of visits (e.g., visit_ID_Role)
   # make two versions -- one for whole visits and one for visit_ID_Role
   ot = thread_occurrences( o, 'Visit_ID' ,fname )   # nrow(ot)= 57835
-  otr = thread_occurrences( o, c('Visit_ID','Role') ,fname )   # nrow(otr) = 527666
+  # otr = thread_occurrences( o, c('Visit_ID','Role') ,fname )   # nrow(otr) = 527666
+  otrw = thread_occurrences( o, c('Visit_ID','Role','Workstation') ,fname )   # nrow(otrw) > 527666
 
   # Now aggregate by clinic-day for each POV
 

From 9ffb0407907049d7e1d95d25abc0c42d58e7495b Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Wed, 7 Aug 2019 10:21:13 -0400
Subject: [PATCH 30/31] Added new procedures to measure all the multiple networks

---
 Data Prep/ACHR_Batch_Aggregate_Data.R |   2 +-
 Data Prep/ACHR_Batch_V3.R             | 298 +++++++++++++++++++++++---
 2 files changed, 273 insertions(+), 27 deletions(-)

diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R
index 4a5416e..3e11d1d 100644
--- a/Data Prep/ACHR_Batch_Aggregate_Data.R
+++ b/Data Prep/ACHR_Batch_Aggregate_Data.R
@@ -535,7 +535,7 @@ count_procedures <- function( p ){
   if (is.na( p )) return( 0 )
 
   # get the overall number of items
-  total_num = str_count( p, '#@#')
+  total_num = str_count( p, '#@#')+1
 
   # adjust for the visit codes that are not actual procedures
   v=c('99211','99212','99213','99214','99215','99201','99202','99203','99204','99205')
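Editor's note: the '+1' in the count_procedures hunk above fixes a delimiter-versus-item off-by-one; a two-line check with stringr:

    library(stringr)

    p <- '99213#@#90471#@#90715'   # three codes joined by two delimiters
    str_count(p, '#@#')            # 2 -- counts separators, not items
    str_count(p, '#@#') + 1        # 3 -- matches length(strsplit(p, '#@#')[[1]])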
diff --git a/Data Prep/ACHR_Batch_V3.R b/Data Prep/ACHR_Batch_V3.R
index 3160953..3cfa233 100644
--- a/Data Prep/ACHR_Batch_V3.R
+++ b/Data Prep/ACHR_Batch_V3.R
@@ -271,15 +271,15 @@ ACHR_batch_clinic_days <- function(occ,TN, CFs) {
     ACHR[b,A_Nodes := nrow(n$nodeDF) ]
     ACHR[b,A_Edges := nrow(n$edgeDF) ]
 
-    n = threads_to_network_original(df,TN, DV2)
-    ACHR[b,AR_NetComplexity := estimate_network_complexity( n )]
-    ACHR[b,AR_Nodes := nrow(n$nodeDF) ]
-    ACHR[b,AR_Edges := nrow(n$edgeDF) ]
-
-    n = threads_to_network_original(df,TN, DV3)
-    ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )]
-    ACHR[b,ARW_Nodes := nrow(n$nodeDF) ]
-    ACHR[b,ARW_Edges := nrow(n$edgeDF) ]
+    # n = threads_to_network_original(df,TN, DV2)
+    # ACHR[b,AR_NetComplexity := estimate_network_complexity( n )]
+    # ACHR[b,AR_Nodes := nrow(n$nodeDF) ]
+    # ACHR[b,AR_Edges := nrow(n$edgeDF) ]
+    #
+    # n = threads_to_network_original(df,TN, DV3)
+    # ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )]
+    # ACHR[b,ARW_Nodes := nrow(n$nodeDF) ]
+    # ACHR[b,ARW_Edges := nrow(n$edgeDF) ]
 
     # # get the entropy for AR and ARW
     # ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ]
@@ -334,8 +334,10 @@ ACHR_batch_clinic_days <- function(occ,TN, CFs) {
   return(ACHR)
 }
 
-###################################################################
-###################################################################
+##################################################################
+##################################################################
+##################################################################
+##################################################################
 
 
 
@@ -380,6 +382,7 @@ ACHR = data.table(bucket=integer(N),
                   A_Entropy = double(N),
                   AR_Entropy = double(N),
                   ARW_Entropy = double(N),
+                  CF_Alignment = double(N),
                   Visit_ID = character(N),
                   Subject_ID = character(N),
                   Clinic = character(N),
@@ -427,8 +430,8 @@ for (i in 1:N){
 
     # compressibility of DV
     ACHR[b,A_CompressRatio := compression_index(df,DV1)]
-    ACHR[b,AR_CompressRatio := compression_index(df,DV2)]
-    ACHR[b,ARW_CompressRatio := compression_index(df,DV3)]
+    # ACHR[b,AR_CompressRatio := compression_index(df,DV2)]
+    # ACHR[b,ARW_CompressRatio := compression_index(df,DV3)]
 
     # NetComplexity of DV
     # First get the network
@@ -439,23 +442,23 @@ for (i in 1:N){
     ACHR[b,A_Nodes := nrow(n$nodeDF) ]
     ACHR[b,A_Edges := nrow(n$edgeDF) ]
 
-    n = threads_to_network_original(df,TN, DV2)
-    ACHR[b,AR_NetComplexity := estimate_network_complexity( n )]
-    ACHR[b,AR_Nodes := nrow(n$nodeDF) ]
-    ACHR[b,AR_Edges := nrow(n$edgeDF) ]
-
-    n = threads_to_network_original(df,TN, DV3)
-    ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )]
-    ACHR[b,ARW_Nodes := nrow(n$nodeDF) ]
-    ACHR[b,ARW_Edges := nrow(n$edgeDF) ]
+    # n = threads_to_network_original(df,TN, DV2)
+    # ACHR[b,AR_NetComplexity := estimate_network_complexity( n )]
+    # ACHR[b,AR_Nodes := nrow(n$nodeDF) ]
+    # ACHR[b,AR_Edges := nrow(n$edgeDF) ]
+    #
+    n = threads_to_network_original(df,TN, DV3)
+    ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )]
+    ACHR[b,ARW_Nodes := nrow(n$nodeDF) ]
+    ACHR[b,ARW_Edges := nrow(n$edgeDF) ]
 
     # # get the entropy for AR and ARW
     # ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ]
     # ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ]
-    ACHR[b, AR_Entropy := compute_graph_entropy_TEST( df[[DV2]] ) ]
-    ACHR[b, ARW_Entropy := compute_graph_entropy_TEST( df[[DV3]] ) ]
+    # ACHR[b, AR_Entropy := compute_graph_entropy_TEST( df[[DV2]] ) ]
+    ACHR[b, ARW_Entropy := compute_graph_entropy_TEST( df[[DV3]] ) ]
+    #
 
     # compute stuff on each context factor
     for (cf in CFs){
@@ -491,6 +494,8 @@ for (i in 1:N){
     ACHR[b,'Weekday' := df[1,'Weekday']]
     ACHR[b,'Month' := df[1,'Month']]
 
+    # Compute the alignment of the context factors
+    ACHR[b, 'CF_Alignment' := ACHR[b,Action_count] / ACHR[b,ARW_Nodes] ]
 
 } # loop thru buckets
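Editor's note: CF_Alignment, as computed above, is the ratio of distinct values on one context factor to distinct values on the combined factors; it is 1 when each Action always occurs with the same Role and Workstation, and falls toward 0 as combinations proliferate. A toy reading, with paste() standing in for combineContextFactors() (an assumption about that helper, not its actual code):

    action    <- c('room', 'vitals', 'exam', 'orders', 'exam', 'orders')
    role_wkst <- c('MA+W1', 'MA+W1', 'MD+W2', 'MD+W2', 'MD+W3', 'RN+W1')

    arw <- paste(action, role_wkst, sep = '+')      # combined context factor
    length(unique(action)) / length(unique(arw))    # 4/6 -- imperfect alignment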
@@ -502,6 +507,168 @@ ACHR[,'A_Entropy' := ACHR[,'Action_entropy'] ]
 
 return(ACHR)
 }
 
+##################### ############################################################
+##################### ############################################################
+##################### ############################################################
+
+
+# ACHR stands for Antecedents of Complexity in Healthcare Routines.
+# This function is set up to compute process parameters on thousands of patient visits.
+# This version assumes that CFs = c('Action','Role','Workstation')
+ACHR_batch_visits_all_CFs <- function(occ,TN, CFs) {
+
+  # Get list of columns we want to use/create
+  dv1=newColName(CFs[1])
+  dv2=newColName(CFs[2])
+  dv3=newColName(CFs[3])
+  dv4=newColName(CFs[2:3])
+  dv5=newColName(CFs[1:2])
+  dv6=newColName(CFs[1:3])
+
+  # need to make new columns
+  occ = combineContextFactors(occ,CFs[2:3], newColName(CFs[2:3]))
+  occ = combineContextFactors(occ,CFs[1:2], newColName(CFs[1:2]))
+  occ = combineContextFactors(occ,CFs[1:3], newColName(CFs[1:3]))
+
+  # We will loop thru this list below
+  dv_list = c(dv1, dv2, dv3, dv4, dv5, dv6)
+  print(dv_list)
+
+  # set key on the data.table for the threadNum to speed retrieval
+  occ=as.data.table(occ)
+  setkeyv(occ, TN)
+
+  # pick subsets -- one visit at a time in this version, but could be more
+  bucket_list <- make_buckets_1(occ, 'threadNum')
+
+  # get the size (number of buckets)
+  N = length(bucket_list)
+  # print(bucket_list)
+
+
+  # pre-allocate the data.table. Tables are supposed to be faster.
+  ACHR = data.table(bucket=integer(N),
+                    NEvents = integer(N),
+                    VisitStart= character(N),
+                    VisitDuration=double(N),
+                    CF_Alignment = double(N),
+                    Visit_ID = character(N),
+                    Subject_ID = character(N),
+                    Clinic = character(N),
+                    LOS_CPT = character(N),
+                    NumProcedures = double(N),
+                    NumDiagnoses = double(N),
+                    Procedure = character(N),
+                    Diagnosis = character(N),   # diag in the raw data
+                    Diagnosis_group = character(N),
+                    Physician = character(N),
+                    Weekday = character(N),
+                    Month = character(N)
+  )
+
+  # Now add columns for each dv.
+  for (dv in dv_list){
+
+    ACHR[, paste0(dv,"_nodes"):= double(N)]
+    ACHR[, paste0(dv,"_edges"):= double(N)]
+    ACHR[, paste0(dv,"_complexity"):= double(N)]
+    ACHR[, paste0(dv,"_entropy"):= double(N)]
+
+  }
+
+  # loop through the buckets. Result will be a data frame with one row per bucket
+  for (i in seq(1,N,1)){
+
+    b = i   # as.integer(bucket_list[i])
+
+    # bucket number
+    ACHR[b,bucket := b]
+
+    # print once every 100 visits
+    # print(b)
+    # print (bucket_list[i])
+
+    # save data every 1000 visits
+    if (b%%1000==0) {
+      print(b)
+      ACHR_partial = ACHR[ACHR$NEvents>1,]
+      save(ACHR_partial, file=paste0("visits_ARW_",nrow(ACHR_partial),'.Rdata'))}
+
+    # select a subset
+    df= subset(occ, threadNum==bucket_list[i] )
+
+    # only do the computations if there are more than two occurrences
+    if (nrow(df) > 2) {
+
+      # make sure it is sorted by timestamp
+      df=df[order(df$tStamp),]
+
+
+      # length of the thread (number of rows)
+      ACHR[b,NEvents := nrow(df)]
+      # print( nrow(df) )
+
+
+      # compute the duration of the visit in hours
+      ACHR[b,VisitDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours') ]
+
+      # compute stuff on each DV in the list
+      for (dv in dv_list){
+
+        n = threads_to_network_local(df,TN, dv)
+
+        ACHR[b,paste0(dv,"_complexity") := estimate_network_complexity( n )]
+        ACHR[b,paste0(dv,"_nodes") := nrow(n$nodeDF) ]
+        ACHR[b,paste0(dv,"_edges") := nrow(n$edgeDF) ]
+        ACHR[b, paste0(dv,"_entropy") := compute_graph_entropy( df[[dv]] ) ]
+
+        # print (dv)
+        # print( nrow(n$nodeDF) )
+        # print( length(unique(df[[dv]])))
+
+      }
+    } # nrow(df) > 2
+
+    # Now copy in the rest of data
+    # this only works because one visit is one bucket
+    ACHR[b,'VisitStart' := as.character(df[1,'tStamp']) ]
+
+    ACHR[b,Visit_ID := df[1,'Visit_ID']]
+    ACHR[b,Subject_ID := df[1,'Subject_ID']]
+    ACHR[b,Clinic := df[1,'Clinic']]
+    ACHR[b,LOS_CPT := df[1,'LOS_CPT']]
+    ACHR[b,Procedure := df[1,'Proc']]
+    ACHR[b,Diagnosis := df[1,'Diag']]
+    ACHR[b,NumProcedures := count_procedures(df$Proc[1]) ]
+    ACHR[b,NumDiagnoses := count_diagnoses(df$Diag[1]) ]
+    ACHR[b,Diagnosis_group := df[1,'Diagnosis_Group']]
+    ACHR[b,Physician := df[1,'Physician']]
+    ACHR[b,Weekday := df[1,'Weekday']]
+    ACHR[b,Month := df[1,'Month']]
+
+    # Compute the alignment of the context factors
+    # This assumes CFs = c('Action','Role','Workstation')
+    # print( ACHR[b,Action_nodes] )
+    # print( ACHR[b,Action_Role_Workstation_nodes] )
+    ACHR[b, 'CF_Alignment' := ACHR[b,Action_nodes] / ACHR[b,Action_Role_Workstation_nodes] ]
+
+  } # loop thru buckets
+
+  # save the data one last time
+  save(ACHR, file=paste0("visits_ARW_",b,'.Rdata'))
+
+  # return the table
+  return(ACHR)
+}
+
+
+##################### ############################################################
+##################### ############################################################
+##################### ############################################################
+##################### ############################################################
+##################### ############################################################
+##################### ############################################################
+
 # Each bucket is a list of thread numbers that can be used to subset the list of occurrences
 make_buckets_1 <- function(o, criteria){
@@ -559,7 +726,10 @@ get_timeScale <- function(){'hr'}
 
 compute_graph_entropy_TEST <- function(s){
 
   # first convert s into text vector
-  text_vector = concatenate(s)
+  text_vector = long_enough( concatenate(s) , 2, ' ')
+  # text_vector = concatenate(s)
+
+  # if (length(text_vector)<2) {return(0)}
 
   # get the 2-grams in the sequence s. ng$prop is the proportion of each edge.  It sums to 1.
   p = get.phrasetable(ngram(text_vector,2))[['prop']]
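Editor's note: the value computed above is the Shannon entropy of the 2-gram proportions, and long_enough() (added in the next hunk) drops threads too short to yield any 2-gram. The same two steps by hand, assuming the ngram package:

    library(ngram)

    tv <- c('a b c a b', 'a')                           # second thread has no 2-grams
    tv <- tv[sapply(strsplit(tv, ' '), length) >= 2]    # what long_enough(tv, 2, ' ') keeps

    p <- get.phrasetable(ngram(tv, 2))$prop             # proportions sum to 1
    -sum(p * log(p))                                    # ~1.04 nats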
@@ -568,3 +738,79 @@ compute_graph_entropy_TEST <- function(s){
 
   return(-sum(p*log(p)))
 
 }
+# to avoid errors in count_ngrams, make sure the length of each thread in the text_vector tv is longer than the n-gram size, n
+# this gets used in various places so need to pass in the delimiter
+long_enough = function(tv,n,delimiter){
+
+  return(tv[ unlist(lapply(1:length(tv), function(i) {length(unlist(strsplit(tv[[i]],delimiter)))>=n})) ])
+
+}
+
+threads_to_network_local <- function(et,TN,CF,grp='threadNum'){
+
+  # print(head(et))
+  #
+  # print(paste('CF=', CF))
+  # print(paste('grp=', grp))
+
+  # First get the node names & remove the spaces
+  node_label = levels(factor(et[[CF]]))   # unique(et[[CF]])
+  node_label=str_replace_all(node_label," ","_")
+  nNodes = length(node_label)
+
+  # print("node_label")
+  # print(node_label)
+  # print(paste('nNodes=', nNodes))
+
+  node_group=character()
+  for (n in 1:nNodes){
+    # hardcoded threadNum for data.table syntax
+    node_group = c(node_group, as.character(unlist( et[which(et[[CF]]==node_label[n]),threadNum][1]) ) )
+  }
+
+  # set up the data frames we need to draw the network
+  nodes = data.frame(
+    id = 1:length(node_label),
+    label = node_label,
+    Group = node_group,
+    title=node_label)
+
+  # get the 2-grams for the edges
+  ngdf = count_ngrams(et,TN, CF, 2)
+
+  # Adjust the frequency of the edges to 0-1 range
+  ngdf$freq = round(ngdf$freq/max(ngdf$freq),3)
+
+  # need to split 2-grams into from and to
+  from_to_str = str_split(str_trim(ngdf$ngrams), " ", n=2)
+
+  # need to find a better way to do this...
+  nEdges = length(from_to_str)
+  # from_labels=matrix(data="", nrow=nEdges,ncol=1)
+  # to_labels =matrix(data="", nrow=nEdges,ncol=1)
+  # from=integer(nEdges)
+  # to=integer(nEdges)
+  # for (i in 1:length(from_to_str)){
+  #
+  #   # Get from and to by splitting the 2-gram
+  #   from_labels[i] = str_split(from_to_str[[i]]," ")[1]
+  #   to_labels[i] = str_split(from_to_str[[i]]," ")[2]
+  #
+  #   # use match to lookup the nodeID from the label...
+  #   from[i] = match(from_labels[i], nodes$label)
+  #   to[i] = match(to_labels[i], nodes$label)
+  # }
+  #
+  # Stopped filtering out selfies (self-loops) July 20, 2019 for Kerstin Sailer bug report
+  edges = data.frame(
+    # from,
+    # to,
+    label = ngdf$freq,
+    Value =ngdf$freq)   # %>% filter(!from==to)
+
+  # print(paste("T2N nodes:",nodes))
+  # print(paste("ngdf = :",ngdf))
+  # print(paste("edges= :",edges))
+
+  return(list(nodeDF = nodes, edgeDF = edges))
+}

From a5bb511507815fbb3f57ca6c204f062c897c6333 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Thu, 29 Aug 2019 14:40:01 -0400
Subject: [PATCH 31/31] bug fix

---
 Data Prep/ACHR_Batch_V3.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Data Prep/ACHR_Batch_V3.R b/Data Prep/ACHR_Batch_V3.R
index 3cfa233..bb80ee7 100644
--- a/Data Prep/ACHR_Batch_V3.R
+++ b/Data Prep/ACHR_Batch_V3.R
@@ -577,7 +577,7 @@ ACHR_batch_visits_all_CFs <- function(occ,TN, CFs) {
   }
 
   # loop through the buckets. Result will be a data frame with one row per bucket
-  for (i in seq(1,N,1)){
+  for (i in seq(N,1,-1)){
 
     b = i   # as.integer(bucket_list[i])