diff --git a/.gitignore b/.gitignore index dd0b92d..4859490 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,25 @@ vignettes/*.pdf rsconnect/ >>>>>>> be0331537859f5addf650269855aa0a9d49ca5d5 .Rproj.user +emrt+Visit_ID+Role_Action.Rdata +Thrds+threadNum+Action.Rdata +Threads+threadNum+Action.Rdata +Threads+threadNum+Action_Role.Rdata +emrt+Visit_ID+Role_Action_Workstation.Rdata +emr+Visit_ID.Rdata +Thrds+threadNum+Action_Role_Workstation.Rdata +Nov 25 am workspace.RData +deleteme.rData +.RDataTmp +CDT_435_1.rData +MW2_5 Edges.png +VRThrds+threadNum+Action.Rdata +Visits_and_cds_435_for_inkyu.rdata +cd_role_trajectory.rData +cd_role_trajectory1.rData +cd_role_trajectory2.rData +cd_wk_trajectory1.rData +cd_wk_trajectory2.rData +cds+Clinic_ymd+Action_Role_Workstation.Rdata +ot-otr-vrthreads-for-inkyu.rdata +trajectory2grams.rData diff --git a/Data Prep/.gitignore b/Data Prep/.gitignore new file mode 100644 index 0000000..e69de29 diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R new file mode 100644 index 0000000..3e11d1d --- /dev/null +++ b/Data Prep/ACHR_Batch_Aggregate_Data.R @@ -0,0 +1,735 @@ +########################################################################################################## +# THREADNET: Batch processing for larger data sets +# +# ACHR_Batch_Aggregate_Data +# +# (c) 2018 Michigan State University. This software may be used according to the terms provided in the +# GNU General Public License (GPL-3.0) https://opensource.org/licenses/GPL-3.0? +# Brian Pentland +# Absolutely no warranty! +########################################################################################################## + + +# Two functions to aggregate the occurrences +# 1) By thread (usually visit, but could be visit_role) +# 2) By Clinic_day + +# Rewrite for speed: +# a) go back to for loop, but just for the stuff that is computed based on the df +# b) grab first row of each thread seqNum ==1 from the table +# c) merge them back together +# d) Probably need to do different versions for visit vs. visit_role because a lot of the parameters make no sense... + +# April 2019 Added compute_graph_entropy + +################################################################### +################################################################### +# This is function is set up to aggregate the occurrences within threads -- typically visits +# occ = pre-processed threaded occurrences +# TN = threadNum in most cases +# CFs can be chosen +# event_CFs define changes within threads +# ALL_CFs are used to compute the CF_alignment + + + +ACHR_batch_threads <- function(occ, EVENT_CFs, ALL_CFs) { + + library(tidyr) + library(dplyr) + library(ThreadNet) + library(ngram) + library(lubridate) + library(stringr) + library(data.table) + + + # get the first occurrence from each thread -- this will be used for the stuff that never changes, like clinic + firstOcc=occ[occ$seqNum==1,] + + # always used threadNum for the TN + TN = 'threadNum' + + # Add columns for combinations of CFs if needed + # don't need to add new column for the thread_CFs. That one has to be here. 
+ # thread_col = newColName(THREAD_CFs) + # TN = thread_col + # if (!(thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CFs,thread_col) } + + new_event_col = newColName(EVENT_CFs) + if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } + + all_cf_col = newColName(ALL_CFs) + if (!(all_cf_col %in% colnames(occ))) { occ = combineContextFactors(occ,ALL_CFs,all_cf_col) } + + # set key on the data.table for the threadNum + occ=as.data.table(occ) + setkeyv(occ, TN) + + # get the list of buckets + bucket_list <- sort( unique(occ[[TN]]) ) + + # get the number of buckets + N = length(bucket_list) + print(paste0('Number of buckets=', N )) + + # allocate the data table with N rows, then for speed, we use := to update it. + ACHR = data.table(threadNum = integer(N), + Phase = character(N), + NEvents = integer(N), + ThreadDuration =double(N), + VisitDuration= double(N), + wait_time1 = double(N), + wait_time2 = double(N), + LOS =double(N), + NetComplexity=double(N), + Nodes=double(N), + Edges=double(N), + CompressRatio = double(N), + Entropy = double(N), + NumProcedures = double(N), + NumDiagnoses = double(N), + ALL_CF_count = integer(N), + ALL_CF_entropy = double(N), + CF_Alignment = double(N) ) + + ################################################# + # add columns for time of each role (or something) + + + + # Now add columns for the CF counts. + for (cf in ALL_CFs){ + ACHR[, paste0(cf,"_count"):= double(N)] + # ACHR[, paste0(cf,"_compression"):= double(N)] + # ACHR[, paste0(cf,"_entropy"):= double(N)] + } + + # update data table with results - only compute the ones that require looking at the whole thread + for (b in 1:N){ + + if (b %% 1000 ==0) print(paste0('Thread count = ',b)) + + # select a subset + df= occ[get(TN)==bucket_list[b]] + + # make sure it is sorted by timestamp + df=df[order(df$tStamp),] + + # get the network -- only if there are enough rows... 
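+    # (very short threads, with two events or fewer, skip network construction and get a
+    # trivial placeholder below, since they cannot produce a usable edge list)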
+ if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) + else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) + + # compute each parameter and update the table + ACHR[b,threadNum := bucket_list[b] ] + ACHR[b,Phase := compute_phase(df$tStamp[1])] + ACHR[b,NEvents := nrow(df) ] + ACHR[b,ThreadDuration:= compute_thread_duration(df) ] + ACHR[b,VisitDuration:= compute_visit_duration(df) ] + ACHR[b, wait_time1 := compute_wait_time1(df) ] + ACHR[b, wait_time2 := compute_wait_time2(df) ] + ACHR[b, NetComplexity:=estimate_network_complexity( n ) ] + ACHR[b, Nodes:=nrow(n$nodeDF) ] + ACHR[b,Edges:=nrow(n$edgeDF) ] + ACHR[b,CompressRatio := compression_index(df,new_event_col) ] + # ACHR[b,Entropy := compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]) ] + ACHR[b,Entropy := compute_graph_entropy( df[[new_event_col]]) ] + ACHR[b,NumProcedures := count_procedures(df$Proc[1]) ] + ACHR[b,NumDiagnoses := count_diagnoses(df$Diag[1]) ] + ACHR[b,ALL_CF_count := length(unique(df[[all_cf_col]])) ] + # ACHR[b,ALL_CF_entropy := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ] + ACHR[b,ALL_CF_entropy := compute_graph_entropy( df[[all_cf_col]]) ] + + ################################################# + # Compute time of each role + # need to difftime start-end of each thread -- compute_thread_duration + # sum up the role chunks for each role within each thread + + + # Count the unique elements in each cf + for (cf in ALL_CFs){ ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] } + } + + # Compute the alignment of the context factors + ACHR$CF_Alignment = as.numeric( as.character(ACHR$Action_count)) / as.numeric( as.character(ACHR$ALL_CF_count )) + + # convert level of service to 1-5 integer + ACHR$LOS = convert_LOS( firstOcc$LOS_CPT ) + + # Merge the results with the first row from each thread + print('Merging results...') + Thrds=merge(x=ACHR, y=firstOcc, by.x='threadNum', by.y ='threadNum' ,all=TRUE) + + save_file_name = paste0(paste('Thrds',TN,new_event_col,sep='+'), '.Rdata') + save(Thrds, file=save_file_name) + + print(paste('Saved ', nrow(Thrds), " records in ",save_file_name)) + + return(Thrds) +} + +################################################################################# +# streamlined version for threads that are PART OF A VISIT. So you can merge with visits +# to get the visit context. Just compute the minimum for speed. +# Needs to pass in the visit data to merge +ACHR_batch_visit_role_threads <- function(occ, EVENT_CFs, ALL_CFs, visits) { + + library(tidyr) + library(dplyr) + library(ThreadNet) + library(ngram) + library(lubridate) + library(stringr) + library(data.table) + + + # get the first occurrence from each thread -- this will be used for the stuff that never changes, like clinic + # firstOcc=occ[occ$seqNum==1,] + + # always used threadNum for the TN + TN = 'threadNum' + + # Add columns for combinations of CFs if needed + # don't need to add new column for the thread_CFs. That one has to be here. 
+ # thread_col = newColName(THREAD_CFs) + # TN = thread_col + # if (!(thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CFs,thread_col) } + + new_event_col = newColName(EVENT_CFs) + if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } + + all_cf_col = newColName(ALL_CFs) + if (!(all_cf_col %in% colnames(occ))) { occ = combineContextFactors(occ,ALL_CFs,all_cf_col) } + + # set key on the data.table for the threadNum + occ=as.data.table(occ) + setkeyv(occ, TN) + + # get the list of buckets + bucket_list <- sort( unique(occ[[TN]]) ) + + # get the number of buckets + N = length(bucket_list) + print(paste0('Number of buckets=', N )) + + # allocate the data table with N rows, then for speed, we use := to update it. + ACHR = data.table(Visit_ID = character(N), + Role_VR = character(N), + Role_ID_VR = character(N), + threadNumVR = integer(N), + NEventsVR = integer(N), + threadStartVR = character(N), + ThreadDurationVR =double(N), + LOS =double(N), + NetComplexityVR=double(N), + NodesVR=double(N), + EdgesVR=double(N), + CompressRatioVR = double(N), + EntropyVR = double(N), + ALL_CF_countVR = integer(N), + ALL_CF_entropyVR = double(N) ) + + ################################################# + # add columns for time of each role (or something) + + # Now add columns for the CF counts. + for (cf in ALL_CFs){ + ACHR[, paste0(cf,"_countVR"):= double(N)] + # ACHR[, paste0(cf,"_compression"):= double(N)] + # ACHR[, paste0(cf,"_entropy"):= double(N)] + } + + # update data table with results - only compute the ones that require looking at the whole thread + for (b in 1:N){ + + if (b %% 1000 ==0) print(paste0('Thread count = ',b)) + + # select a subset + df= occ[get(TN)==bucket_list[b]] + + # get the network -- only if there are enough rows... 
+ if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) + else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) + + # compute each parameter and update the table + ACHR[b,Visit_ID := as.character(df$Visit_ID[1]) ] + ACHR[b,Role_VR:= as.character(df$Role[1]) ] + ACHR[b,Role_ID_VR:= as.character(df$Role_ID[1]) ] + ACHR[b,threadNumVR := bucket_list[b] ] + ACHR[b,NEventsVR := nrow(df) ] + ACHR[b,threadStartVR := as.character(min(lubridate::ymd_hms(df$tStamp))) ] + ACHR[b,ThreadDurationVR:= difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours' ) ] + ACHR[b, NetComplexityVR:=estimate_network_complexity( n ) ] + ACHR[b, NodesVR:=nrow(n$nodeDF) ] + ACHR[b,EdgesVR:=nrow(n$edgeDF) ] + ACHR[b,CompressRatioVR := compression_index(df,new_event_col) ] + # ACHR[b,EntropyVR := compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]) ] + ACHR[b,EntropyVR := compute_graph_entropy( df[[new_event_col]]) ] + + ACHR[b,ALL_CF_countVR := length(unique(df[[all_cf_col]])) ] + # ACHR[b,ALL_CF_entropyVR := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ] + ACHR[b,ALL_CF_entropyVR := compute_graph_entropy( df[[all_cf_col]]) ] + + ################################################# + # Compute time of each role + # need to difftime start-end of each thread -- compute_thread_duration + # sum up the role chunks for each role within each thread + + + # Count the unique elements in each cf + for (cf in ALL_CFs){ ACHR[b, paste0(cf,"_countVR") := length(unique(df[[cf]])) ] } + } + + + # Merge the results with the first row from each thread + print('Merging results...') + VRThrds=merge(x=ACHR, y=visits, by.x='Visit_ID', by.y ='Visit_ID' ,all=TRUE) + + save_file_name = paste0(paste('VRThrds',TN,new_event_col,sep='+'), '.Rdata') + save(VRThrds, file=save_file_name) + + print(paste('Saved ', nrow(VRThrds), " records in ",save_file_name)) + + return(VRThrds) +} + +################################################################################# +# streamlined version for adding or recomputing columns that describe a visit. +# So you can merge with visits to get rest of the columns. Just compute the minimum for speed. +# Can re-write this as needed... +ACHR_batch_visit_add_columns <- function(occ, EVENT_CFs, visits) { + + library(tidyr) + library(dplyr) + library(ThreadNet) + library(ngram) + library(lubridate) + library(stringr) + library(data.table) + + + # get the first occurrence from each thread -- this will be used for the stuff that never changes, like clinic + # firstOcc=occ[occ$seqNum==1,] + + # always used threadNum for the TN + TN = 'threadNum' + + # Add columns for combinations of CFs if needed + # don't need to add new column for the thread_CFs. That one has to be here. + # thread_col = newColName(THREAD_CFs) + # TN = thread_col + # if (!(thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CFs,thread_col) } + + new_event_col = newColName(EVENT_CFs) + if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } + + + # set key on the data.table for the threadNum + occ=as.data.table(occ) + setkeyv(occ, TN) + + # get the list of buckets + bucket_list <- sort( unique(occ[[TN]]) ) + + # get the number of buckets + N = length(bucket_list) + print(paste0('Number of buckets=', N )) + + # allocate the data table with N rows, then for speed, we use := to update it. 
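+  # (pre-allocating all N rows once and then filling them in place with := avoids
+  # growing the table row by row inside the loop)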
+ ACHR = data.table(Visit_ID = character(N), + new_wait_time1 = double(N), + new_wait_time2 = double(N) ) + + # Role_VR = character(N), + # Role_ID_VR = character(N), + # threadNumVR = integer(N), + # NEventsVR = integer(N), + # threadStartVR = character(N), + # ThreadDurationVR =double(N), + # NetComplexityVR=double(N), + # NodesVR=double(N), + # EdgesVR=double(N), + # CompressRatioVR = double(N), + # EntropyVR = double(N), + # ALL_CF_countVR = integer(N), + # ALL_CF_entropyVR = double(N) ) + + + # update data table with results - only compute the ones that require looking at the whole thread + for (b in 1:N){ + + if (b %% 1000 ==0) print(paste0('Thread count = ',b)) + + # select a subset + df= occ[get(TN)==bucket_list[b]] + + # get the network -- only if there are enough rows... + # if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) + # else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) + # + # compute each parameter and update the table + ACHR[b,Visit_ID := as.character(df$Visit_ID[1]) ] + ACHR[b, new_wait_time1 := compute_wait_time1(df) ] + ACHR[b, new_wait_time2 := compute_wait_time2(df) ] + } + + + # Compute the alignment of the context factors + # ACHR$CF_AlignmentVR = as.numeric( as.character(ACHR$Action_countVR)) / as.numeric( as.character(ACHR$ALL_CF_countVR )) + + # Merge the results with the first row from each thread + print('Merging results...') + # VRThrds=merge(x=ACHR, y=visits, by.x='Visit_ID', by.y ='Visit_ID' ,all=TRUE) + VRThrds=merge(x=ACHR, y=visits, by.x='Visit_ID', by.y ='Visit_ID') + + save_file_name = paste0(paste('VRThrds',TN,new_event_col,sep='+'), '.Rdata') + save(VRThrds, file=save_file_name) + + print(paste('Saved ', nrow(VRThrds), " records in ",save_file_name)) + + return(VRThrds) +} +################################################################################## +# Helper functions +################################################################################# +compute_phase <- function(t){ + + t=as.Date(t) + + if ( t >= '2016-01-01' & t < '2016-06-07') return('one') + if ( t >= '2016-06-07' & t < '2016-09-01') return('two') + if ( t >= '2016-09-01' & t < '2017-04-15') return('three') + if ( t >= '2017-04-15' & t < '2017-09-01') return('four') + if ( t >= '2017-09-01' & t < '2017-12-31') return('five') + + # else return NA + return(NA) + +} +compute_thread_duration <- function(df){ + + return(difftime(lubridate::ymd_hms(df$tStamp[nrow(df)]), lubridate::ymd_hms(df$tStamp[1]), units='hours' )) + +} + +compute_visit_duration <- function(df){ + + # get the checkin time. Note that they sometimes look at patient data in advance of the visit, so you + # cannot use the first occurrences to mark the start of the thread + w1= grep('CHECKIN_TIME',df$Action)[1] + w2= grep('AVS_PRINT_TIME',df$Action)[1] + + return(difftime(lubridate::ymd_hms(df$tStamp[w2]), lubridate::ymd_hms(df$tStamp[w1]), units='hours' )) + +} + +# First wait time... from checkin until they get their vitals taken +compute_wait_time1 <- function(df){ + + # find the first occurrence of MR_VN_VITALS. Corresponds closely to wait time + # w= grep('MR_VN_VITALS',df$Action)[1] + + w1= grep('CHECKIN_TIME',df$Action)[1] + w2= grep('VITALS',df$Action)[1] + + return( max(0, as.numeric(difftime( lubridate::ymd_hms(df$tStamp[w2]),lubridate::ymd_hms(df$tStamp[w1]), units='hours' ) ))) + +} + +# Second wait time... 
from vitals until they see a doctor or other medical person +# NEW VERSION -- look for "AC_VISIT_NAVIGATOR" by a different role than "VITALS" +compute_wait_time2 <- function(df){ + + # find the first occurrence of MR_VN_VITALS. Get the time t1 + # w1= grep('MR_VN_VITALS',df$Action)[1] + w1= grep('VITALS',df$Action)[1] + + if (is.na(w1)) + return(NA) + else + { + t1 = lubridate::ymd_hms(df$tStamp[w1]) + + + # get the workstation and role used for the vitals at t1 + # work1 = df$Workstation[w1] + role1 = df$Role[w1] + + # get set of actions at the same workstation or different workstations that happen later in the visit + # Truncate the visit up to w1 + restdf = df[(w1+1):nrow(df),] + + # get the actions performed by other roles in the rest of the visit + differentRole = restdf[restdf$Role != role1,] + + # Now look for first instance of AC_VISIT_NAVIGATOR that is by one of the other roles + # first_AC_NAV = grep('AC_VISIT_NAVIGATOR',differentRole$Action)[1] + # t2 = lubridate::ymd_hms(differentRole$tStamp[first_AC_NAV]) + + # Try first instance of different role. period. + t2 = lubridate::ymd_hms(differentRole$tStamp[1]) + + + # return the difference in time from t1 to t2 + return( max(0, as.numeric(difftime( t2, t1, units='hours' ) ))) + } +} + +# Second wait time... from vitals until they see a doctor or other medical person +# compute_wait_time2 <- function(df){ +# +# # find the first occurrence of MR_VN_VITALS. Get the time t1 +# # w1= grep('MR_VN_VITALS',df$Action)[1] +# w1= grep('VITALS',df$Action)[1] +# +# if (is.na(w1)) +# return(NA) +# else +# { +# t1 = lubridate::ymd_hms(df$tStamp[w1]) +# +# +# # get the workstation and role used for the vitals at t1 +# work1 = df$Workstation[w1] +# role1 = df$Role[w1] +# +# # get set of actions at the same workstation or different workstations that happen later in the visit +# # Truncate the visit up to w1 +# restdf = df[w1:nrow(df),] +# +# # then get the set of action at the same workstation or different workstations (in that visit) +# sameWS = restdf[restdf$Workstation == work1,] +# differentWS = restdf[restdf$Workstation != work1,] +# +# # get list of other roles +# otherRoles = paste(setdiff(unique(restdf$Role),c('Technician','Technologist','Staff','Unknown')), +# collapse = '|') +# +# # Now find the marker for when they see a physician +# # procedure is differenrt in BRKPT and HHPOB clinics +# if (df$Clinic[1] %in% c('BRKPT','HHPOB') ) +# { # LPN brings a different workstation on wheels at these clinics +# firstLPN = grep('LPN',differentWS$Role)[1] +# t2 = lubridate::ymd_hms(differentWS$tStamp[firstLPN]) +# +# # Other roles use the same workstation +# firstDoc = grep('Physician|Resident|Registered', sameWS$Role)[1] +# t2 = lubridate::ymd_hms(sameWS$tStamp[firstDoc]) +# } +# else +# { # a different role, using the same workstation +# w2 = max(w1, grep(otherRoles, sameWS$Role) )[1] +# t2 = lubridate::ymd_hms(sameWS$tStamp[w2]) +# } +# +# # return the difference in time from t1 to t2 +# return( difftime( t2, t1, units='hours' ) ) +# } +# } + + +# THIS ONE IS GOOD. 
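+# A small illustrative example (made-up codes, not from the data): for p = '12345#@#99213'
+# the separator count gives 2 items in total, and 99213 is in the list of visit codes below,
+# so count_procedures() would return 1 actual procedure.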
+count_procedures <- function( p ){ + + # if no procedures, return zero + if (is.na( p )) return( 0 ) + + # get the overall number of items + total_num = str_count( p, '#@#')+1 + + # adjust for the visit codes that are not actual procedures + v=c('99211','99212','99213','99214','99215','99201','99202','99203','99204','99205') + + # just look at the first occurrence in the df + num_visit_codes = sum(str_count( p, v )) + + # limit to non-negative + total_num = max(0, total_num - num_visit_codes) + + return( total_num ) +} + +# This one is easy +count_diagnoses <- function(d){ + + # Sometimes the marker is missing, so set the floor to one. There is always at least one. + return( max(1, str_count( d, '#@#' ))) +} + +count_daily_procedures <- function( df ) { + + # get the string of procedures for that day + allproc = unlist(df[df$seqNum==1, 'Proc']) + + if (length(allproc)>0) + return( sum(sapply(allproc, function(p) {count_procedures(p)} )) ) + else return(0) + + +} + +# convert the LOS_CPT codes into a five point numeric scale +# operate on whole column to create new column +convert_LOS <- function(los_cpt){ + + # get the 5th character + s = sapply(los_cpt, function(cpt) { substring(as.character(cpt),5,5) } ) + + los = as.numeric(s) + + # convert 9 and NA to 1 + los[los==9]=NA + + +return(los) + +} + +################################################################### +################################################################### +# This is function is set up to aggregate the occurrences among collections of visit -- typically clinic_days +# occ = pre-processed threaded occurrences +# CFs can be chosen +ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) { + + library(tidyr) + library(dplyr) + library(ThreadNet) + library(ngram) + library(lubridate) + library(stringr) + library(data.table) + + # make a list of unique buckets. + bucket_list <- unique(occ[[TN]]) + N = length(bucket_list) + + # print the number of buckets + print(paste0('Number of buckets=', N )) + + # Add columns for combinations of CFs if needed + new_event_col = newColName(EVENT_CFs) + if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } + + all_cf_col = newColName(ALL_CFs) + if (!(all_cf_col %in% colnames(occ))) { occ = combineContextFactors(occ,ALL_CFs,all_cf_col) } + + # this will speed up retrieving the subsets + setkeyv(occ, TN) + + # allocate the data table with N rows, then for speed, we use := to update it. + cds = data.table(Clinic_ymd = character(N), + Clinic = character(N), + ymd = character(N), + Phase = character(N), + NEvents = integer(N), + ClinicDayStart = character(N), + ClinicDayDuration =double(N), + NumVisits = integer(N), + NumUniqueDiagnosisGroups = integer(N), + NumPhysicians = integer(N), + TotalStaff = integer(N), + NetComplexity=double(N), + Nodes=double(N), + Edges=double(N), + CompressRatio = double(N), + Entropy = double(N), + ALL_CF_count = integer(N), + ALL_CF_entropy = double(N), + CF_Alignment = double(N)) + + # Now add columns for the CF counts. + for (cf in ALL_CFs){ + cds[, paste0(cf,"_countVR"):= double(N)] + # cds[, paste0(cf,"_compression"):= double(N)] + # cds[, paste0(cf,"_entropy"):= double(N)] + } + + + # make data frame with results + for (b in 1:N){ + + + if (b %% 100 ==0) print(paste0('Thread count = ',b)) + + + # select a subset of occurrences for the bucket + df = occ[ occ[[TN]] == bucket_list[b] ] + + + # get the network -- only if there are enough rows... 
+ if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) + else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) + + # compute each parameter and put them in a vector + + cds[b,Clinic_ymd := bucket_list[b] ] + cds[b,Clinic := as.character( df$Clinic[1] ) ] + cds[b,ymd := as.character( df$ymd[1] ) ] + cds[b,Phase := compute_phase(df$tStamp[1]) ] + cds[b,threadNum := as.numeric(df[1,'threadNum']) ] + cds[b,NEvents := nrow(df) ] + cds[b,ClinicDayStart := as.character(lubridate::ymd_hms(df$tStamp[1])) ] + cds[b,ClinicDayDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units := 'hours' ) ] + cds[b,NumVisits := length(unique( df$Visit_ID )) ] + cds[b,NumUniqueDiagnosisGroups := length(unique( df$Diagnosis_Group )) ] + cds[b,NumPhysicians := length(unique( df$Physician )) ] + cds[b,TotalStaff := length(unique( df$Role_ID )) ] + cds[b, NetComplexity := estimate_network_complexity( n ) ] + cds[b,Nodes := nrow(n$nodeDF) ] + cds[b,Edges := nrow(n$edgeDF) ] + cds[b,CompressRatio := compression_index(df,new_event_col) ] + # cds[b,Entropy := compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]) ] + cds[b,Entropy := compute_graph_entropy( df[[new_event_col]]) ] + + cds[b,NumProceduresPerDay := count_daily_procedures(df) ] + cds[b,CF_Alignment := 1 ] # make placeholder, but compute below + cds[b,ALL_CF_count := length(unique(df[[all_cf_col]])) ] + # cds[b,ALL_CF_entropy := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ] + cds[b,ALL_CF_entropy := compute_graph_entropy( df[[all_cf_col]]) ] + + # Count the unique elements in each cf + for (cf in ALL_CFs){ cds[b, paste0(cf,"_countVR") := length(unique(df[[cf]])) ] } + + +} + + # Compute the alignment of the context factors + cds$CF_Alignment = as.numeric( as.character(cds$Action_count)) / as.numeric( as.character(cds$ALL_CF_count )) + + save(cds, file=paste0(paste('cds',TN,new_event_col,sep='+'), '.Rdata')) + + return(cds) + +} + +compute_graph_entropy <- function(s){ + + # guard against insufficient data... + if (length(s) <2) return(0) + + # convert s into text vector + text_vector = concatenate(s) + + # get the 2-grams in the sequence s. 'prop' is the proportion of each edge. It sums to 1. + p = get.phrasetable(ngram(text_vector,2))[['prop']] + + # return Shannon entropy + return(-sum(p*log(p))) + +} + +get_action_frequencies <- function(s,n){ + + # guard against insufficient data... + if (length(s) < n) return(0) + + # convert s into text vector + text_vector = concatenate(s) + + # get the 2-grams in the sequence s. 'freq' is the frequency + return( get.phrasetable(ngram(text_vector, n )) ) + +} + + diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R new file mode 100644 index 0000000..96800d3 --- /dev/null +++ b/Data Prep/ACHR_Batch_PreProcess.R @@ -0,0 +1,246 @@ +########################################################################################################## +# THREADNET: Batch processing for larger data sets +# +# ACHR_Batch_PreProcess +# +# (c) 2018 Michigan State University. This software may be used according to the terms provided in the +# GNU General Public License (GPL-3.0) https://opensource.org/licenses/GPL-3.0? +# Brian Pentland +# Absolutely no warranty! 
+# +# Separate out the two basic functions +# 1) reading/cleaning the occurrences +# 2) Threading the data by adding ThreadNum and SeqNum to the threads +# +########################################################################################################## +library(tidyr) +library(data.table) +library(dplyr) +library(ThreadNet) +library(ngram) +library(lubridate) + +redo_ACHR_data_from_scratch <- function(fname){ + + setwd("~/Documents/@NSF Healthcare Routines/Deidentified_Derm_data_Nov_2018") + + fname = 'audit_111818' + + # Pick you point of view, or multiple POV... + THREAD_CF = c('Visit_ID') + # EVENT_CF = c('Action','Role','Workstation') + EVENT_CF = c('Action') + ALL_CF = c('Action','Role','Workstation') + + # first read the data + o = read_ACHR_data( fname ) # nrow(o) = 7728927 + + # fix the role IDs -- needs file called RoleChangeTable.csv in the local directory + o = fix_derm_role_ID(o) + + # Thread occurrences adds threadNum and seqNum to each thread, as defined by the the thread_CF. + # Threads are always WITHIN VISITS in this code -- so they can be whole visits or chunks of visits (e.g., visit_ID_Role) + # make two versions -- one for whole visits and one for visit_ID_Role + ot = thread_occurrences( o, 'Visit_ID' ,fname ) # nrow(ot)= 57835 + # otr = thread_occurrences( o, c('Visit_ID','Role') ,fname ) # nrow(otr) = 527666 + otrw = thread_occurrences( o, c('Visit_ID','Role','Workstation') ,fname ) # nrow(otrw) > 527666 + + + # Now aggregate by clinic-day for each POV + clinic_days = ACHR_batch_clinic_days(occ, THREAD_CF, EVENT_CF, fname) + + # Aggregate by visit for each POV + # Whole visits + visits = ACHR_batch_visits(ot, THREAD_CF, EVENT_CF, fname) + + # And then by role + VRThreads <- ACHR_batch_visit_role_threads(otr, c('Action'), c('Action','Role','Workstation') , visits) + + + # compute trajectories to see difference from a reference graph + traj = graph_trajectory( ot, 'Clinic_ymd', EVENT_CF, 2, 1, 0, 'Clinic_trajectory') + +} + +# this function reads the raw data from URMC +# It returns the data frame and also saves it as an Rdata file with the same name as the csv +# Tested Nov 16. 
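+# Typical call (fname is the csv name without the extension, matching the driver above):
+#   d <- read_ACHR_data('audit_111818')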
+read_ACHR_data <- function(fname){ + + + + + # This code is tailored for reading in new data from URMC, October 2018 + # Assumes there is a column called 'Timestamps" in column 13 and "Visit_ID" + # read the file into data.table + d <- fread( paste0(fname, '.csv') ) + + # Sort by visit and timestamps (will do this later with setkey) + d <- arrange(d,desc(Visit_ID,asc(Timestamps))) + + # This stuff is URMC specific + # + # move and rename the columns, as needed + # Mainly need to get the Timestamps into the first column and rename them tStamp + cn=colnames(d) + cnx = cn[c(13,1:12,14:15)] + d=setcolorder(d,cnx) + setnames(d,'Timestamps','tStamp') + setnames(d,'V1','seqn') + + # Fix the name of the Highland clinic + d$Clinic = gsub('HH POB','HHPOB',d$Clinic) + + + # This converts numbers to char and replaces spaces with underscore + d = cleanOccBatch(d) + + ### This is URMC specific: Add the clinic and clinic_day and fix the code for the highland clinic + + # get the date only from the timestamp + d$ymd <- format( as.POSIXct(d$tStamp, format="%Y-%m-%d"), "%Y-%m-%d" ) + + # make new columns for clinic + day and events + d = unite(d, 'Clinic_ymd', c('Clinic','ymd'),sep='_',remove = 'false') + + + # Save the result + save_file_name = paste0(fname, '.rData') + save(d, file=save_file_name) + print(paste("Saved ",nrow(d), " occurrences in ",save_file_name)) + + return(d) + +} + +# clean up the raw occurrence data +# Remove blanks for n-gram functionality +cleanOccBatch <- function(fileRows){ + + # extract tStamp + tStamp <- fileRows$tStamp + + # First get rid of any underscores pre-existing in the data, then + # confirm all spaces are converted to underscores in non tStamp columns; set as factors + cleanedCF <- data.frame(lapply(fileRows[2:ncol(fileRows)], function(x){ gsub(" ","_",x) }) ) + + # bind tStamp back to cleaned data + complete <- as.data.table(cbind(tStamp,cleanedCF)) + + # Old code for forcing tStamp into a "YMD_HMS" format -- not needed for URMC EMR data + # complete$tStamp <- as.character(complete$tStamp) + # complete$tStamp <- parse_date_time(complete$tStamp, c("dmy HMS", "dmY HMS", "ymd HMS","dmy HM", "dmY HM", "ymd HM")) + + # add weekday and month + # complete$weekday <- as.factor(weekdays(as.Date(complete$tStamp))) + # complete$month <- as.factor(months(as.Date(complete$tStamp))) + + return(complete) +} + +####################################################################################### +# In the derm data, the role "technician" was used for people who were not technicians +# Pass in the table of occurrences, fix it, and return it +# read in the list of changes +fix_derm_role_ID <- function(o){ + + # read in the new role ID + rc = read.csv('RoleChangeTable.csv', stringsAsFactors = FALSE) + + + # occurrences are a data.table so use := + for (r in 1:nrow(rc)) { + + old_rid = rc$OLD_Role_ID[r] + new_rid = rc$NEW_Role_ID[r] + new_role = rc$NEW_Role[r] + + # pick out rows with old Role_ID + # old_rid_vector = o$Role_ID==old_rid + # + # o[old_rid_vector, Role_ID := new_rid] + # o[old_rid_vector, Role := new_role] + + # crazy data table syntax... do it all in one shot + o[o$Role_ID==old_rid, ':=' (Role = new_role, Role_ID = new_rid ) ] + + } + + return(o) +} + + + + +######################################################################################### +# This function adds columns for the the thread, as requested ( Tested Nov 16 ) +# Then it sorts by thread and tStamp and adds thread/sequence numbers to each thread. 
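+# Thread boundaries are found where the combined thread column changes from one row to the
+# next (via dplyr::lag), so the occurrences must already be sorted by Visit_ID and tStamp.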
+# +# Typical column names from the UMRC file +# ALL_CF = c('Action','Workstation','Role','Clinic','Diagnosis_Group') +# thread_CF = c('Visit_ID', 'Role', 'Workstation') +# event_CF = c('Action','Role','Workstation') +# It returns the threaded set of occurrences and also saves it as Rdata. +thread_occurrences <- function(occ, THREAD_CF, fname='emr'){ + + print(paste('Number of occurrences: ', nrow(occ))) + + # these will be new column names + new_thread_col = newColName(THREAD_CF) + # new_event_col = newColName(EVENT_CF) + + print(paste('new_thread_col: ',new_thread_col)) + # print(paste('new_event_col: ', new_event_col)) + + # Add names for the new columns, if necessary + if (!(new_thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CF,new_thread_col) } + # if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CF,new_event_col) } + + # ThreadNet code assumes these columns will be there, so we need to add them + occ$threadNum = integer(nrow(occ)) + occ$seqNum = integer(nrow(occ)) + + # Assume the data are sorted by visit_ID and tStamp -- the natural flow of the visit. + idx_list = which(occ[[new_thread_col]] !=dplyr::lag(occ[[new_thread_col]])) + idx_list = c(idx_list, nrow(occ)) + + # creates two lists of indices so they match up correctly. + start_idx = c(1, head(idx_list,-1)) # add 1 to the front and drop the last + end_idx = idx_list - 1 # shift back by one + + print(paste('Number of threads in this POV: ', length(idx_list))) + + # sapply(seq(length(idx_list)), function(x) + for (x in seq(length(idx_list))) + { if (x %% 1000 ==0) print(paste0('Thread count = ',x)) + occ[start_idx[x]:end_idx[x],'threadNum' := x ] + occ[start_idx[x]:end_idx[x],'seqNum':= c(1:(end_idx[x]-start_idx[x]+1)) ] + } + + # Now save the result for later and return it, as well. 
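+  # (the output file name is '<fname>+<thread column>.rData', e.g. 'emr+Visit_ID.rData'
+  # for the default fname with Visit_ID threads)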
+ # save(occ, file=paste0(paste(fname,new_thread_col,new_event_col,sep='+'), '.Rdata')) + + save_file_name = paste0(paste0(paste(fname,new_thread_col,sep='+'), '.rData')) + save(occ, file=save_file_name) + + print(paste('Saved threaded occurrences: ', nrow(occ), " in ",save_file_name)) + + return(occ) +} + + + + ############ extra stuff not used ############# + # split occ data frame by threadNum to find earliest time value for that thread + # # then substract that from initiated relativeTime from above + # occ_split = lapply(split(occ, occ$threadNum), + # function(x) {x$relativeTime = difftime(x$relativeTime, min(lubridate::ymd_hms(x$tStamp)), units=timescale ); x}) + # + # # # row bind data frame back together + # occ= data.frame(do.call(rbind, occ_split)) + + + + + + diff --git a/Data Prep/ACHR_Batch_V2.R b/Data Prep/ACHR_Batch_V2.R index 730f0b5..818ef08 100644 --- a/Data Prep/ACHR_Batch_V2.R +++ b/Data Prep/ACHR_Batch_V2.R @@ -35,7 +35,7 @@ # this function reads the raw data from URMC and creates files for clinic_day and visits -read_ACHR_data <- function(){ +read_ACHR_data_old <- function(){ library(tidyr) library(data.table) @@ -48,7 +48,9 @@ read_ACHR_data <- function(){ # This code is tailored for reading in new data from URMC, October 2018 # read the file into data frame - d <<- fread('auditfinal_10022018.csv') + d <<- fread('auditfinal_10022018.csv') + + # d <<- fread('10test.csv') # Sort and convert to data.table vt <<-data.table(arrange(d,desc(Visit_ID,asc(Timestamps)))) @@ -58,7 +60,7 @@ read_ACHR_data <- function(){ cnx = cn[c(13,1:12,14:15)] vt=setcolorder(vt,cnx) setnames(vt,'Timestamps','tStamp') - setnames(vt,'X','seqn') + # setnames(vt,'X','seqn') # make a dataframe copy just for fun vdf <<- as.data.frame(vt) diff --git a/Data Prep/ACHR_Batch_V3.R b/Data Prep/ACHR_Batch_V3.R index a404eec..bb80ee7 100644 --- a/Data Prep/ACHR_Batch_V3.R +++ b/Data Prep/ACHR_Batch_V3.R @@ -271,19 +271,23 @@ ACHR_batch_clinic_days <- function(occ,TN, CFs) { ACHR[b,A_Nodes := nrow(n$nodeDF) ] ACHR[b,A_Edges := nrow(n$edgeDF) ] - n = threads_to_network_original(df,TN, DV2) - ACHR[b,AR_NetComplexity := estimate_network_complexity( n )] - ACHR[b,AR_Nodes := nrow(n$nodeDF) ] - ACHR[b,AR_Edges := nrow(n$edgeDF) ] + # n = threads_to_network_original(df,TN, DV2) + # ACHR[b,AR_NetComplexity := estimate_network_complexity( n )] + # ACHR[b,AR_Nodes := nrow(n$nodeDF) ] + # ACHR[b,AR_Edges := nrow(n$edgeDF) ] + # + # n = threads_to_network_original(df,TN, DV3) + # ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )] + # ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] + # ACHR[b,ARW_Edges := nrow(n$edgeDF) ] - n = threads_to_network_original(df,TN, DV3) - ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )] - ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] - ACHR[b,ARW_Edges := nrow(n$edgeDF) ] + # # get the entropy for AR and ARW + # ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] + # ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] # get the entropy for AR and ARW - ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] - ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] + ACHR[b, AR_Entropy := compute_graph_entropy_TEST(df[[DV2]]) ] + ACHR[b, ARW_Entropy := compute_graph_entropy_TEST(df[[DV3]]) ] # compute stuff on each context factor @@ -296,7 +300,10 @@ ACHR_batch_clinic_days <- function(occ,TN, CFs) { ACHR[b, paste0(cf,"_compression") := compression_index(df,cf) ] # get the entropy - ACHR[b, 
paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] + # ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] + + # get the graph entropy + ACHR[b, paste0(cf,"_entropy") := compute_graph_entropy_TEST(df[[cf]]) ] } } # kf nrows > 2 @@ -327,8 +334,10 @@ ACHR_batch_clinic_days <- function(occ,TN, CFs) { return(ACHR) } -################################################################### -################################################################### +################################################################## +################################################################## +################################################################## +################################################################## @@ -373,6 +382,7 @@ ACHR = data.table(bucket=integer(N), A_Entropy = double(N), AR_Entropy = double(N), ARW_Entropy = double(N), + CF_Alignment = double(N), Visit_ID = character(N), Subject_ID = character(N), Clinic = character(N), @@ -420,8 +430,8 @@ for (i in 1:N){ # compressibility of DV ACHR[b,A_CompressRatio := compression_index(df,DV1)] - ACHR[b,AR_CompressRatio := compression_index(df,DV2)] - ACHR[b,ARW_CompressRatio := compression_index(df,DV3)] + # ACHR[b,AR_CompressRatio := compression_index(df,DV2)] + # ACHR[b,ARW_CompressRatio := compression_index(df,DV3)] # NetComplexity of DV # First get the network @@ -432,20 +442,23 @@ for (i in 1:N){ ACHR[b,A_Nodes := nrow(n$nodeDF) ] ACHR[b,A_Edges := nrow(n$edgeDF) ] - n = threads_to_network_original(df,TN, DV2) - ACHR[b,AR_NetComplexity := estimate_network_complexity( n )] - ACHR[b,AR_Nodes := nrow(n$nodeDF) ] - ACHR[b,AR_Edges := nrow(n$edgeDF) ] - - n = threads_to_network_original(df,TN, DV3) - ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )] - ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] - ACHR[b,ARW_Edges := nrow(n$edgeDF) ] + # n = threads_to_network_original(df,TN, DV2) + # ACHR[b,AR_NetComplexity := estimate_network_complexity( n )] + # ACHR[b,AR_Nodes := nrow(n$nodeDF) ] + # ACHR[b,AR_Edges := nrow(n$edgeDF) ] + # + n = threads_to_network_original(df,TN, DV3) + ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )] + ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] + ACHR[b,ARW_Edges := nrow(n$edgeDF) ] - # get the entropy for AR and ARW - ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] - ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] + # # get the entropy for AR and ARW + # ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] + # ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] + # ACHR[b, AR_Entropy := compute_graph_entropy_TEST( df[[DV2]] ) ] + ACHR[b, ARW_Entropy := compute_graph_entropy_TEST( df[[DV3]] ) ] + # # compute stuff on each context factor for (cf in CFs){ @@ -457,8 +470,9 @@ for (i in 1:N){ ACHR[b, paste0(cf,"_compression") := compression_index(df,cf) ] # get the entropy - ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] - + # ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] + ACHR[b, paste0(cf,"_entropy") := compute_graph_entropy_TEST( df[[cf]]) ] + } } # kf nrows > 2 @@ -480,6 +494,8 @@ for (i in 1:N){ ACHR[b,'Weekday' := df[1,'Weekday']] ACHR[b,'Month' := df[1,'Month']] + # Compute the alignment of the context factors + ACHR[b, 'CF_Alignment' := ACHR[b,Action_count] / ACHR[b,ARW_Nodes] ] } # loop thru buckets @@ -491,6 +507,168 @@ ACHR[,'A_Entropy' 
:= ACHR[,'Action_entropy'] ] return(ACHR) } +##################### ############################################################ +##################### ############################################################ +##################### ############################################################ + + +# ACHR stands for Antecedents of Complexity in Healthcare Routines. +# This is function is set up to compute process parameters on thousands of patient visits. +# This version assumes that CFs = c('Action','Role','Workstation') +ACHR_batch_visits_all_CFs <- function(occ,TN, CFs) { + + # Get list of columns we want to use/create + dv1=newColName(CFs[1]) + dv2=newColName(CFs[2]) + dv3=newColName(CFs[3]) + dv4=newColName(CFs[2:3]) + dv5=newColName(CFs[1:2]) + dv6=newColName(CFs[1:3]) + + # need to make new columns + occ = combineContextFactors(occ,CFs[2:3], newColName(CFs[2:3])) + occ = combineContextFactors(occ,CFs[1:2], newColName(CFs[1:2])) + occ = combineContextFactors(occ,CFs[1:3], newColName(CFs[1:3])) + + # We will loop thru this list below + dv_list = c(dv1, dv2, dv3, dv4, dv5, dv6) + print(dv_list) + + # set key on the data.table for the threadNum to speed retrival + occ=as.data.table(occ) + setkeyv(occ, TN) + + # pick subsets -- one visit at a time in this version, but could be more + bucket_list <- make_buckets_1(occ, 'threadNum') + + # get the size (number of buckets) + N = length(bucket_list) + # print(bucket_list) + + + # pre-allocate the data.table. Tables are supposed to be faster. + ACHR = data.table(bucket=integer(N), + NEvents = integer(N), + VisitStart= character(N), + VisitDuration=double(N), + CF_Alignment = double(N), + Visit_ID = character(N), + Subject_ID = character(N), + Clinic = character(N), + LOS_CPT = character(N), + NumProcedures = double(N), + NumDiagnoses = double(N), + Procedure = character(N), + Diagnosis = character(N), # diag in the raw data + Diagnosis_group = character(N), + Physician = character(N), + Weekday = character(N), + Month = character(N) + ) + + # Now add columns for each dv. + for (dv in dv_list){ + + ACHR[, paste0(dv,"_nodes"):= double(N)] + ACHR[, paste0(dv,"_edges"):= double(N)] + ACHR[, paste0(dv,"_complexity"):= double(N)] + ACHR[, paste0(dv,"_entropy"):= double(N)] + + } + + # loop through the buckets. 
Result will be data frame with one row per bucket + for (i in seq(N,1,-1)){ + + b = i # as.integer(bucket_list[i]) + + # bucket number + ACHR[b,bucket := b] + + # print once every 100 visits + # print(b) + # print (bucket_list[i]) + + # save data every 1000 visits + if (b%%1000==0) { + print(b) + ACHR_partial = ACHR[ACHR$NEvents>1,] + save(ACHR_partial, file=paste0("visits_ARW_",nrow(ACHR_partial),'.Rdata'))} + + # select a subset + df= subset(occ, threadNum==bucket_list[i] ) + + # only do the computations if there are more than two occurrences + if (nrow(df) > 2) { + + # make sure it is sorted by timestamp + df=df[order(df$tStamp),] + + + # length of the thread (number of rows) + ACHR[b,NEvents := nrow(df)] + # print( nrow(df) ) + + + # compute the duration of the visit in hours + ACHR[b,VisitDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours') ] + + # compute stuff on each DV in the list + for (dv in dv_list){ + + n = threads_to_network_local(df,TN, dv) + + ACHR[b,paste0(dv,"_complexity") := estimate_network_complexity( n )] + ACHR[b,paste0(dv,"_nodes") := nrow(n$nodeDF) ] + ACHR[b,paste0(dv,"_edges") := nrow(n$edgeDF) ] + ACHR[b, paste0(dv,"_entropy") := compute_graph_entropy( df[[dv]] ) ] + + # print (dv) + # print( nrow(n$nodeDF) ) + # print( length(unique(df[[dv]]))) + + } + } # df nrows > 3 + + # Now copy in the rest of data + # this only works because one visit is one bucket + ACHR[b,'VisitStart' := as.character(df[1,'tStamp']) ] + + ACHR[b,Visit_ID := df[1,'Visit_ID']] + ACHR[b,Subject_ID := df[1,'Subject_ID']] + ACHR[b,Clinic := df[1,'Clinic']] + ACHR[b,LOS_CPT := df[1,'LOS_CPT']] + ACHR[b,Procedure := df[1,'Proc']] + ACHR[b,Diagnosis := df[1,'Diag']] + ACHR[b,NumProcedures := count_procedures(df$Proc[1]) ] + ACHR[b,NumDiagnoses := count_diagnoses(df$Diag[1]) ] + ACHR[b,Diagnosis_group := df[1,'Diagnosis_Group']] + ACHR[b,Physician := df[1,'Physician']] + ACHR[b,Weekday := df[1,'Weekday']] + ACHR[b,Month := df[1,'Month']] + + # Compute the alignment of the context factors + # This assumes CFs = c('Action','Role','Workstation') + # print( ACHR[b,Action_nodes] ) + # print( ACHR[b,Action_Role_Workstation_nodes] ) + ACHR[b, 'CF_Alignment' := ACHR[b,Action_nodes] / ACHR[b,Action_Role_Workstation_nodes] ] + + } # loop thru buckets + + # save the data one last time + save(ACHR, file=paste0("visits_ARW_",b,'.Rdata')) + + # return the table + return(ACHR) +} + + +##################### ############################################################ +##################### ############################################################ +##################### ############################################################ +##################### ############################################################ +##################### ############################################################ +##################### ############################################################ + # Each bucket is a list of thread numbers that can be used to subset the list of occurrences make_buckets_1 <- function(o, criteria){ @@ -539,3 +717,100 @@ cleanOccBatch <- function(fileRows){ get_timeScale <- function(){'hr'} +# create a function to compute graph entropy +# It will use the standard Shannon entropy, but it will apply it to the frequencies of edges in the +# adjacency matrix. To do this, we just need to count the 2-grams in the sequence and divide by the total +# number of 2-grams to get the probabilities. You should be able to pass in any sequence. 
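+# A worked toy example (illustrative sequence, not real data): the sequence A B A B A C
+# has 2-grams A_B, B_A, A_B, B_A, A_C, i.e. proportions 0.4, 0.4, 0.2, so the graph
+# entropy is -(0.4*log(0.4) + 0.4*log(0.4) + 0.2*log(0.2)) ~= 1.05 (natural log).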
+# NOTE: This version will work on one sequence at a time (eg. one patient visit) +# Needs to be revised to work on groups of visits (e.g,. clinic-days) +compute_graph_entropy_TEST <- function(s){ + + # first convert s into text vector + text_vector = long_enough( concatenate(s) , 2, ' ') + # text_vector = concatenate(s) + + # if (length(text_vector)<2) {return(0)} + + # get the 2-grams in the sequence s. ng$prop is the proportion of each edge. It sums to 1. + p = get.phrasetable(ngram(text_vector,2))[['prop']] + + # return Shannon entropy + return(-sum(p*log(p))) + +} +# to avoid errors in count_ngrams, make sure the length of each thread in the text_vector tv is longer than the n-gram size, n +# this gets used in various places so need to pass in the delimiter +long_enough = function(tv,n,delimiter){ + + return(tv[ unlist(lapply(1:length(tv), function(i) {length(unlist(strsplit(tv[[i]],delimiter)))>=n})) ]) + +} + +threads_to_network_local <- function(et,TN,CF,grp='threadNum'){ + + # print(head(et)) + # + # print(paste('CF=', CF)) + # print(paste('grp=', grp)) + + # First get the node names & remove the spaces + node_label = levels(factor(et[[CF]])) # unique(et[[CF]]) + node_label=str_replace_all(node_label," ","_") + nNodes = length(node_label) + + # print("node_label") + # print(node_label) + # print(paste('nNodes=', nNodes)) + + node_group=character() + for (n in 1:nNodes){ + # hardcoded threadNum for data.table syntax + node_group = c(node_group, as.character(unlist( et[which(et[[CF]]==node_label[n]),threadNum][1]) ) ) + } + + # set up the data frames we need to draw the network + nodes = data.frame( + id = 1:length(node_label), + label = node_label, + Group = node_group, + title=node_label) + + # get the 2 grams for the edges + ngdf = count_ngrams(et,TN, CF, 2) + + # Adjust the frequency of the edges to 0-1 range + ngdf$freq = round(ngdf$freq/max(ngdf$freq),3) + + # need to split 2-grams into from and to + from_to_str = str_split(str_trim(ngdf$ngrams), " ", n=2) + + # need to find a better way to do this... + nEdges = length(from_to_str) + # from_labels=matrix(data="", nrow=nEdges,ncol=1) + # to_labels =matrix(data="", nrow=nEdges,ncol=1) + # from=integer(nEdges) + # to=integer(nEdges) + # for (i in 1:length(from_to_str)){ + # + # # Get from and to by spliting the 2-gram + # from_labels[i] = str_split(from_to_str[[i]]," ")[1] + # to_labels[i] = str_split(from_to_str[[i]]," ")[2] + # + # # use match to lookup the nodeID from the label... + # from[i] = match(from_labels[i], nodes$label) + # to[i] = match(to_labels[i], nodes$label) + # } + # + # Stopped filtering out selfies July 20, 2019 for Kerstin Sailer bug report + edges = data.frame( + # from, + # to, + label = ngdf$freq, + Value =ngdf$freq) # %>% filter(!from==to) + + # print(paste("T2N nodes:",nodes)) + # print(paste("ngdf = :",ngdf)) + # print(paste("edges= :",edges)) + + return(list(nodeDF = nodes, edgeDF = edges)) +} diff --git a/Data Prep/ACHR_Clinic_Trajectories_v3.R b/Data Prep/ACHR_Clinic_Trajectories_v3.R index 48525c5..56376e1 100644 --- a/Data Prep/ACHR_Clinic_Trajectories_v3.R +++ b/Data Prep/ACHR_Clinic_Trajectories_v3.R @@ -8,6 +8,11 @@ # these functions will take clinic data and produce trajectories (distance of the graph from a reference graph) +# need to rethink how this selects occurrences in buckets. +# Should buckets always be by clinic_day?? This allows us to track what happens in each clinic over time. + +# We want to track roles over time in each clinic. So it's Clinic_day and also visit_role. 
We have threads by visit_role + # start with auditfinal_10022018.csv --> use ACHR_Batch_v2 to make new_occ. # The code here starts with new_occ (~ 7.72m occurrences that we break down into 1857 clinic_days and 57835 visits) @@ -22,20 +27,11 @@ library(philentropy) library(stringr) library(zoo) -# need function to count the number of procedures and number of diagnoses in the data. -# some visits have dozens of them -# need to pass in data frame that just has the column with the procedures or diagnoses -count_procs <- function(o){ - print(nrow(o)) - -unlist(lapply(1:nrow(o), function(i){ str_count(o[i], pattern='#@#') })) - -} # Use this to put the trajectory data with the clinic_day data merge_clinic_days_and_trajectories <- function(){ - clinic_days$Clinic_date = gsub('HH_POB','HHPOB',clinic_days$Clinic_date) + # clinic_days$Clinic_date = gsub('HH_POB','HHPOB',clinic_days$Clinic_date) cds=merge(x=test0,y=clinic_days,by.x='Clinic_ymd',by.y ='Clinic_date' ,all=TRUE) } @@ -58,31 +54,158 @@ get_bucket <- function(o, b ){ # @param e data frame for POV -# @param blist is the bucket list. Each bucket is a "window" +# @param bucket_CFs is the column used to define the bucket list, or the combination of columns needed to create the bucket. Each bucket is a "window" # @param cf is the column that defines events (e.g. 'Action_Role') -# @param filter list contains the threshold levels for filtering out edges with low frequency: c(0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1) -graph_trajectory_filtered <- function(e, cf, reference_day=1, filter_list,save_file_name) { +# @param filter list contains the threshold level for filtering out edges with low frequency. +# +# MAKE SURE YOU ARE READING IN A SET OF OCCURRENCES THAT HAS THE CORRECT THREADS... +# +graph_trajectory <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1, filter_threshold=0,keep_ngram_vectors=FALSE, save_file_name='deleteme') { - # get total size of possible matrix - Max_Order = length(unique(e[[cf]]))^2 + # add column for bucket_CF if needed + bucket_col = newColName(bucket_CFs) + if (!(bucket_col %in% colnames(e))) { e = combineContextFactors(e,bucket_CFs,bucket_col) } + # here is the sorted list of buckets + blist = sort(unique(e[[bucket_col]])) + + # now many buckets? + nb = length(blist) + # print(blist) + print(paste('Number of buckets =',nb)) # make data frame for results - vt=data.frame( ngrams=character(), freq=integer(), bid=integer() ) + vt=data.frame( ngrams=character(), freq=integer(), bid=integer(),bname=character() ) - # here is the list of buckets - blist = unique(e[['Clinic_ymd']]) + bcount=0 + # scan through the data + for (b in blist){ + bcount= bcount +1 + # print(paste('b =',b)) + + # THIS CHECK NEEDS TO COVER WHOLE CHUNK OF CODE... + # get text vector for the whole data set. Bucket needs at least 2 threads + # n_gram_size = 1 for 1-grams, n_gram_size = 2 for pairs. 
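+      # (each bucket's n-gram frequency vector is later compared to the reference day's
+      # vector, and to the next bucket's, using cosine distance)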
+ th = e[ e[[bucket_col]] ==b , ] + + if (nrow(th)>3) + { + ngdf = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2] + + + nodes = length(unique(th[[cf]])) + + + # print(paste('nrow ngdf =',nrow(ngdf))) + + # add the bucket number and name + ngdf$bid = bcount + ngdf$bname = b + + # append the columns to the end + # vt is the whole set of all ngrams in all the windows + vt=rbind(vt,ngdf) + } + } + + + # convert to factor + vt$ngrams = factor(vt$ngrams) + + # nWindows = length(unique(vt$bid)) + nWindows = nb + #print(paste('nWindows =',nWindows)) + + # get the set of unique ngrams for the whole data set + vt_unique = data.frame(ngrams=unique(vt$ngrams)) + #print(vt_unique) + + # put the results here + windowFreqMatrix = matrix(0,nrow=nWindows, ncol=nrow(vt_unique)) + + for (i in 1:nWindows){ + + # get the merged listunqi + vtmerge = merge(x=vt_unique, y=vt[vt$bid==i,], by='ngrams', all.x = TRUE) + + # use the bid.y to get the whole vector, but replace the NA with zeros + bb=vtmerge[vtmerge$bid==i,'freq'] + bb[is.na(bb)] <- 0 + + windowFreqMatrix[i,]=bb + + print( paste(i, 'num non-zero=', sum(windowFreqMatrix[i,] > 0))) + + } + + # #Make a matrix to filter by where all the entries are 0 < f < 1 + # fm = windowFreqMatrix/max(windowFreqMatrix) + # + # # if the number if below the threshold, set it to zero + # windowFreqMatrix[fm<=filter_threshold] = 0 - # take the underscore out so we can use it to strsplit below... and sort it - blist=gsub('HH_POB','HHPOB',blist) - blist = sort(blist) + df = data.table( + Clinic_ymd = character(nWindows), + Clinic = character(nWindows), + # Role_ymd = character(nWindows), + # Role = character(nWindows), + ymd = character(nWindows), + pct_retained = double(nWindows), + complexity = double(nWindows), + Dist_from_reference = double(nWindows), + Dist_from_next = double(nWindows) + ) + for (i in 1:(nWindows-1) ) { + + df[i, Clinic_ymd := blist[i] ] + df[i, Clinic := unlist(strsplit(blist[i],'_',fixed=TRUE))[1] ] + df[i, ymd := unlist(strsplit(blist[i],'_',fixed=TRUE))[2] ] + + # df[i, Role_ymd := as.character(blist[i]) ] + # df[i, Role := unlist(strsplit(as.character(blist[i]),'+',fixed=TRUE))[1] ] + # df[i, ymd := unlist(strsplit(as.character(blist[i]),'+',fixed=TRUE))[2] ] + + df[i, pct_retained := sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/sum(windowFreqMatrix[i,]>0) ] + df[i, complexity := estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ) ] + df[i, Dist_from_reference := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), method='cosine' ) ] + df[i, Dist_from_next := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), method='cosine' ) ] + } + + + # get the ngram data and labels + if (keep_ngram_vectors) { + b_df=as.data.frame(windowFreqMatrix[1:(nWindows-1),]) + colnames(b_df)=vt_unique$ngrams + + # stick the ngram frequencies on the end for good measure + df = cbind(df,b_df) + } + + # save the result and return it, too + save(df, file=paste0(save_file_name,'.rData') ) + + return(df) + +} +################################################################################## + +graph_trajectory_ROLES <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1, filter_threshold=0,keep_ngram_vectors=FALSE, save_file_name='deleteme') { + + # add column for bucket_CF if needed + bucket_col = newColName(bucket_CFs) + if (!(bucket_col %in% colnames(e))) { e = combineContextFactors(occ,bucket_CFs,bucket_col) } + + # here is the sorted list of buckets + blist = sort(unique(e[[bucket_col]])) + # now 
many buckets? nb = length(blist) - print(blist) - print(paste('nb =',nb)) + # print(blist) + print(paste('Number of buckets =',nb)) - complexity_idx=list(nb) + # make data frame for results + vt=data.frame( ngrams=character(), freq=integer(), bid=integer(),bname=character() ) bcount=0 # scan through the data @@ -90,9 +213,13 @@ graph_trajectory_filtered <- function(e, cf, reference_day=1, filter_list,save_ bcount= bcount +1 # print(paste('b =',b)) - # get text vector for the whole data set. Bucket needs at least 2 threads - th = get_bucket(e, b) - if (nrow(th)>2) { ngdf = count_ngrams(th, 'threadNum', cf, 2)[1:2] } + # THIS CHECK NEEDS TO COVER WHOLE CHUNK OF CODE... + # get text vector for the whole data set. Bucket needs at least 2 threads + # n_gram_size = 1 for 1-grams, n_gram_size = 2 for pairs. + th = e[ e[[bucket_col]] ==b , ] + + if (nrow(th)>3) { ngdf = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2] } + nodes = length(unique(th[[cf]])) @@ -101,13 +228,12 @@ graph_trajectory_filtered <- function(e, cf, reference_day=1, filter_list,save_ # add the bucket number and name ngdf$bid = bcount - # ngdf$bname = b + ngdf$bname = b # append the columns to the end # vt is the whole set of all ngrams in all the windows vt=rbind(vt,ngdf) } - print(head(complexity_idx)) # convert to factor @@ -126,7 +252,7 @@ graph_trajectory_filtered <- function(e, cf, reference_day=1, filter_list,save_ for (i in 1:nWindows){ - # get the merged list + # get the merged listunqi vtmerge = merge(x=vt_unique, y=vt[vt$bid==i,], by='ngrams', all.x = TRUE) # use the bid.y to get the whole vector, but replace the NA with zeros @@ -139,53 +265,272 @@ graph_trajectory_filtered <- function(e, cf, reference_day=1, filter_list,save_ } - #Make a matrix to filter by where all the entries are 0 < f < 1 - fm = windowFreqMatrix/max(windowFreqMatrix) - - # Now filter out the edges - for (f in filter_list) { - - # if the number if below the threshold, set it to zero - windowFreqMatrix[fm<=f] = 0 - - - # correlate each row with the first one stick it in a dataframe - df =data.frame(window=1:(nWindows-1), - Clinic_ymd = blist[1:(nWindows-1)], - Clinic= unlist(lapply(1:(nWindows-1), - function(i){ unlist(strsplit(blist[i],'_'))[1] })), - ymd=unlist(lapply(1:(nWindows-1), - function(i){ unlist(strsplit(blist[i],'_'))[2] })), - pct_retained = unlist(lapply(1:(nWindows-1), - function(i){sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/ - sum(windowFreqMatrix[i,]>0) - })) , - pct_possible = unlist(lapply(1:(nWindows-1), - function(i){sum(windowFreqMatrix[i,] > 0)/Max_Order })) , - complexity = unlist(lapply(1:(nWindows-1), - function(i){ estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ) })) , - Dist_from_reference= unlist(lapply(1:(nWindows-1), - function(i){distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), - method='cosine' ) })), - Dist_from_next= unlist(lapply(1:(nWindows-1), - function(i){distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), - method='cosine' ) })) - ) - - save(df, file=paste0(save_file_name,f,'.rData') ) - - } # f in filter list - - return(df) + # #Make a matrix to filter by where all the entries are 0 < f < 1 + # fm = windowFreqMatrix/max(windowFreqMatrix) + # + # # if the number if below the threshold, set it to zero + # windowFreqMatrix[fm<=filter_threshold] = 0 - # get the ngram data and labels - # b_df=as.data.frame(windowFreqMatrix[1:(nWindows-1),]) - # colnames(b_df)=vt_unique$ngrams - # - # # stick the ngram frequencies on the end for good 
measure - # return(cbind(df,b_df)) + df = data.table( + # Clinic_ymd = character(nWindows), + # Clinic = character(nWindows), + Role_ymd = character(nWindows), + Role = character(nWindows), + ymd = character(nWindows), + pct_retained = double(nWindows), + complexity = double(nWindows), + Dist_from_reference = double(nWindows), + Dist_from_next = double(nWindows) + ) + + for (i in 1:(nWindows-1) ) { + + # df[i, Clinic_ymd := blist[i] ] + # df[i, Clinic := unlist(strsplit(blist[i],'_'))[1] ] + # df[i, ymd := unlist(strsplit(blist[i],'_'))[2] ] + + df[i, Role_ymd := as.character(blist[i]) ] + df[i, Role := unlist(strsplit(as.character(blist[i]),'+',fixed=TRUE))[1] ] + df[i, ymd := unlist(strsplit(as.character(blist[i]),'+',fixed=TRUE))[2] ] + + + df[i, pct_retained := sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/sum(windowFreqMatrix[i,]>0) ] + df[i, complexity := estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ) ] + df[i, Dist_from_reference := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), method='cosine' ) ] + df[i, Dist_from_next := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), method='cosine' ) ] + } + + + # get the ngram data and labels + if (keep_ngram_vectors) { + b_df=as.data.frame(windowFreqMatrix[1:(nWindows-1),]) + colnames(b_df)=vt_unique$ngrams + + # stick the ngram frequencies on the end for good measure + df = cbind(df,b_df) + } + + # save the result and return it, too + save(df, file=paste0(save_file_name,'.rData') ) + + return(df) } +################################################################################## +# e = otr, needs to be set up to include threads that are visit_ID_Role +# there must be a 'threadNum" assigned in each row +# +# cf is the context factor we want to follow, such as 'Action' + + +get_ngrams_for_buckets <- function(e, cf, n_gram_size=2, filter_threshold=2, save_file_name='deleteme') { + + + + # here is the sorted list of buckets + blist = sort(unique(e$threadNum)) + + + # now many buckets? + nb = length(blist) + # print(blist) + print(paste('Number of buckets =',nb)) + + # make data frame for results + vt=data.frame( ngrams=character(), freq=integer(), bid=integer(),bname=character() ) + + bcount=0 + # scan through the data + for (b in blist){ + bcount= bcount +1 + # print(paste('b =',b)) + + + # n_gram_size = 1 for 1-grams, n_gram_size = 2 for pairs. 
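+  # Note: here filter_threshold is the minimum number of occurrence rows a thread
+  # must have in order to be counted (nrow(th) > filter_threshold), not the
+  # edge-frequency cutoff used by the graph_trajectory* functions above.
+  # From its use below, count_ngrams() is assumed to return a data frame whose
+  # first two columns are the n-gram label and its frequency; only those two
+  # columns are kept, e.g. (illustrative values only):
+  #
+  #   ngrams               freq
+  #   "Check_In Rooming"      3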
+ th = e[ e[['threadNum']] ==b , ] + + if (nrow(th)>filter_threshold) { ngdf = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2] + + # add the bucket number and name + ngdf$bid = bcount + ngdf$bname = b + + # append the columns to the end + # vt is the whole set of all ngrams in all the windows + vt=rbind(vt,ngdf) + } + } + + + # convert to factor + vt$ngrams = factor(vt$ngrams) + + # nWindows = length(unique(vt$bid)) + nWindows = nb + #print(paste('nWindows =',nWindows)) + + # get the set of unique ngrams for the whole data set + vt_unique = data.frame(ngrams=unique(vt$ngrams)) + #print(vt_unique) + + # put the results here + windowFreqMatrix = matrix(0,nrow=nWindows, ncol=nrow(vt_unique)) + + for (i in 1:nWindows){ + + # get the merged listunqi + vtmerge = merge(x=vt_unique, y=vt[vt$bid==i,], by='ngrams', all.x = TRUE) + + # use the bid.y to get the whole vector, but replace the NA with zeros + bb=vtmerge[vtmerge$bid==i,'freq'] + bb[is.na(bb)] <- 0 + + windowFreqMatrix[i,]=bb + + print( paste(i, 'num non-zero=', sum(windowFreqMatrix[i,] > 0))) + + } + + + # get the ngram data and labels + # convert matrix to data frame + # df=as.data.frame(windowFreqMatrix[1:(nWindows-1),]) + df=as.data.frame(windowFreqMatrix) + + # add the columnnames + colnames(df)=vt_unique$ngrams + + # add the bucket names + df = cbind(as.data.frame(blist), df) + colnames(df)[1] <- c("threadNum") + + + + # save the result and return it, too + #save(df, file=paste0(save_file_name,'.rData') ) + + return(df) + +} +################################################################################## +graph_trajectory_week <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1, filter_threshold=0,keep_ngram_vectors=FALSE, save_file_name='deleteme') { + + # add column for bucket_CF if needed + bucket_col = newColName(bucket_CFs) + if (!(bucket_col %in% colnames(e))) { e = combineContextFactors(occ,bucket_CFs,bucket_col) } + + # here is the sorted list of buckets + blist = sort(unique(e[[bucket_col]])) + + # now many buckets? + nb = length(blist) + # print(blist) + print(paste('Number of buckets =',nb)) + + # make data frame for results + vt=data.frame( ngrams=character(), freq=integer(), bid=integer(),bname=character() ) + + bcount=0 + # scan through the data + for (b in blist){ + bcount= bcount +1 + # print(paste('b =',b)) + + # THIS CHECK NEEDS TO COVER WHOLE CHUNK OF CODE... + # get text vector for the whole data set. Bucket needs at least 2 threads + # n_gram_size = 1 for 1-grams, n_gram_size = 2 for pairs. 
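+      # As the note above says, the nrow(th) > 3 check below wraps only the
+      # count_ngrams() call; the lines that set ngdf$bid / ngdf$bname and rbind
+      # onto vt still run, so a bucket that is too small would re-append the
+      # previous bucket's ngdf. A minimal sketch of the guarded version,
+      # mirroring graph_trajectory() above and assuming nothing else changes:
+      #
+      #   th = e[ e[[bucket_col]] == b , ]
+      #   if (nrow(th) > 3) {
+      #     ngdf  = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2]
+      #     nodes = length(unique(th[[cf]]))
+      #     ngdf$bid   = bcount
+      #     ngdf$bname = b
+      #     vt = rbind(vt, ngdf)
+      #   }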
+ th = e[ e[[bucket_col]] ==b , ] + + if (nrow(th)>3) { ngdf = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2] } + + + nodes = length(unique(th[[cf]])) + + + # print(paste('nrow ngdf =',nrow(ngdf))) + + # add the bucket number and name + ngdf$bid = bcount + ngdf$bname = b + + # append the columns to the end + # vt is the whole set of all ngrams in all the windows + vt=rbind(vt,ngdf) + } + + + # convert to factor + vt$ngrams = factor(vt$ngrams) + + # nWindows = length(unique(vt$bid)) + nWindows = nb + #print(paste('nWindows =',nWindows)) + + # get the set of unique ngrams for the whole data set + vt_unique = data.frame(ngrams=unique(vt$ngrams)) + #print(vt_unique) + + # put the results here + windowFreqMatrix = matrix(0,nrow=nWindows, ncol=nrow(vt_unique)) + + for (i in 1:nWindows){ + + # get the merged list + vtmerge = merge(x=vt_unique, y=vt[vt$bid==i,], by='ngrams', all.x = TRUE) + + # use the bid.y to get the whole vector, but replace the NA with zeros + bb=vtmerge[vtmerge$bid==i,'freq'] + bb[is.na(bb)] <- 0 + + windowFreqMatrix[i,]=bb + + print( paste(i, 'num non-zero=', sum(windowFreqMatrix[i,] > 0))) + + } + + # #Make a matrix to filter by where all the entries are 0 < f < 1 + # fm = windowFreqMatrix/max(windowFreqMatrix) + # + # # if the number if below the threshold, set it to zero + # windowFreqMatrix[fm<=filter_threshold] = 0 + + df = data.table( + Clinic_week = character(nWindows), + Clinic = character(nWindows), + ymd = character(nWindows), + pct_retained = double(nWindows), + complexity = double(nWindows), + Dist_from_reference = double(nWindows), + Dist_from_next = double(nWindows) + ) + + for (i in 1:(nWindows-1) ) { + df[i, Clinic_week := blist[i] ] + # df[i, Clinic_ymd := blist[i] ] + # df[i, Clinic := unlist(strsplit(blist[i],'_'))[1] ] + # df[i, ymd := unlist(strsplit(blist[i],'_'))[2] ] + df[i, pct_retained := sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/sum(windowFreqMatrix[i,]>0) ] + df[i, complexity := estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ) ] + df[i, Dist_from_reference := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), method='cosine' ) ] + df[i, Dist_from_next := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), method='cosine' ) ] + } + + + # get the ngram data and labels + if (keep_ngram_vectors) { + b_df=as.data.frame(windowFreqMatrix[1:(nWindows-1),]) + colnames(b_df)=vt_unique$ngrams + + # stick the ngram frequencies on the end for good measure + df = cbind(df,b_df) + } + + # save the result and return it, too + save(df, file=paste0(save_file_name,'.rData') ) + + return(df) + +} \ No newline at end of file diff --git a/Data Prep/Comparison_functions.R b/Data Prep/Comparison_functions.R new file mode 100644 index 0000000..4c962f9 --- /dev/null +++ b/Data Prep/Comparison_functions.R @@ -0,0 +1,50 @@ +########################################################################################################## +# THREADNET: Batch processing for larger data sets + +# (c) 2017 Michigan State University. This software may be used according to the terms provided in the +# GNU General Public License (GPL-3.0) https://opensource.org/licenses/GPL-3.0? +# Absolutely no warranty! 
+########################################################################################################## + + +library(dplyr) + + + + +# o is the list of occurrences +# m is the metric we want to see +routine_metric <- function(o, cf, m){ + + switch(m, + len = nrow(o), + lex_size = length(unique(o[[cf]])), + first_order = table(o[[cf]]), + first_order_sort = sort( table(o[[cf]]), decreasing = T) + + ) + +} + +get_range <- function(o, start_date, end_date){ + + o %>% filter(as.Date(tStamp) > start_date & as.Date(tStamp) < end_date) + +} + + +# get start dates -- just a place to stick some code +# April 2019 + +get_start_dates <- function(){ + + # first get the roles that are relativeluy frequent + role_freq2 = as.data.frame(table(ot$Role_ID)[table(ot$Role_ID)>1000]) + + # then get their start dates +role_freq2$start_date = sapply(role_freq2$Var1, function(x) {min(as.character(ot$tStamp[ot$Role_ID==as.character(x)]))}) + + # then trim it so it's more readable. +role_freq2$start_date = substr(role_freq2$start_date,1,10) + +} \ No newline at end of file diff --git a/ParallelTest/ptest.R b/ParallelTest/ptest.R new file mode 100644 index 0000000..3957931 --- /dev/null +++ b/ParallelTest/ptest.R @@ -0,0 +1,26 @@ +########################################################################################################## +# THREADNET: Batch processing for larger data sets + +# (c) 2017 Michigan State University. This software may be used according to the terms provided in the +# GNU General Public License (GPL-3.0) https://opensource.org/licenses/GPL-3.0? +# Absolutely no warranty! +########################################################################################################## + +library(parallel) + +ptest <-function(o) { + x<<-0 + oo<<-o + + l=1:nrow(o) + # return( sapply(l,f1 ) ) + + new_o = cbind(o, sapply(l,f1 ) ) + return(new_o) +} + +f1 <- function(r){ + x<<-x+1 + return(as.character(oo[r,5])) +} + diff --git a/Stats/precedence_functions.R b/Stats/precedence_functions.R new file mode 100644 index 0000000..888851e --- /dev/null +++ b/Stats/precedence_functions.R @@ -0,0 +1,38 @@ + +# need to test if this is working correctly +# It should be counting the number of times that each element in the lexicon precedes each other element. + +# make some data +S=lapply(1:10,function(x) {c('a','b','a','c',sample(letters[5:6],1+round(runif(1)*10),replace=TRUE)) }) + +# take list of sequences, S +precedes <- function(S){ + + # Get lexicon + lex = sort(unique(unlist(S))) + + # for each element in the lexicon... + result = matrix(data = sapply(lex, function(a){ + + # Find its position(s) in each sequence. 
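+  # pos below is built with grep(a, s), which treats a as a regular expression,
+  # so with longer labels one element could match inside another (hypothetical
+  # example: "Room" would also match "Rooming"); which(s == a) gives exact
+  # matches. str_detect() further down comes from stringr, which this file never
+  # loads, so library(stringr) at the top (or stringr::str_detect) is assumed.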
pos is a list of vectors + pos=sapply(S, function(s) {grep(a,s)}) + + # now for the last position in each string, find the lexical elements that precede it + # get the substrings + subS = mapply(function(p,s){if (max(p)>1) s[1:max(p)]},pos, S) + + # Now find all of the other lexical elements that precede a in each subS + colSums(sapply(lex,function(x){str_detect(subS,x)})) } + ) , + + ncol=length(lex)) + + rownames(result) = lex + colnames(result) = lex + + # normalize by the number of sequences,so "always" = 1 + result = result/length(S) + + return(result) + +} \ No newline at end of file diff --git a/Tables and Figures/Clinic_day_plots.R b/Tables and Figures/Clinic_day_plots.R index e0fb1ac..b105277 100644 --- a/Tables and Figures/Clinic_day_plots.R +++ b/Tables and Figures/Clinic_day_plots.R @@ -34,11 +34,234 @@ plots_for_papers <- function(){ plot(cds$Clinic.x,cds$A_NetComplexity) # Clinic change over time, with and without smoothing - ggplot(data = cds, aes(x = ymd, y = corr_with_first, group=Clinic)) + geom_line(aes(color=Clinic)) - ggplot(data = cds, aes(x = ymd, y = rollmean(corr_with_first,5,na.pad=TRUE), group=Clinic)) + geom_line(aes(color=Clinic)) + ggplot(data = cds, aes(x = ymd, y = Dist_from_reference, group=Clinic)) + geom_line(aes(color=Clinic)) + ggplot(data = cds, aes(x = ymd, y = rollmean(Dist_from_reference,5,na.pad=TRUE), group=Clinic)) + geom_line(aes(color=Clinic)) + # Dec 6th version + ggplot(data = cdt, aes(x = ymd.x, y = Dist_from_reference, group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank(), axis.text.y=element_blank()) + + + ggplot(data = cds_435_1 %>% filter(Phase=='one'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435_1 %>% filter(Phase=='two'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435_1 %>% filter(Phase=='three'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435_1 %>% filter(Phase=='four'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435_1 %>% filter(Phase=='five'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + # Here are the pairs + ggplot(data = cds_435 %>% filter(Phase=='one'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435 %>% filter(Phase=='two'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435 %>% filter(Phase=='three'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435 %>% filter(Phase=='four'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435 %>% 
filter(Phase=='five'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + # these are based on sub-sets of the data by phase + ggplot(data = cdp2, aes(x = ymd, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic)) + + geom_line(aes(color=Clinic)) + theme(axis.text.x=element_blank(), axis.text.y=element_blank()) + + + + # ROle change over time + ggplot(data = cds, aes(x = ymd, y = Dist_from_reference, group=Role)) + geom_line(aes(color=Role)) + + # get the wait times and visit duration for each clinic + # convert to minutes. + a= visits %>% + group_by(Clinic) %>% + summarize(n = n(), + NEvents = mean(NEvents, na.rm = TRUE), + wait1 = mean(wait1*60, na.rm = TRUE), + sdwait1 = sd(wait1*60, na.rm = TRUE), + wait2 = mean(wait2*60, na.rm = TRUE), + sdwait2 = sd(wait2*60, na.rm = TRUE), + Duration = mean(VisitDuration, na.rm = TRUE) + ) + + # For some reason, the sd function was not working correctly above... do it again here. + visits %>% + group_by(Clinic) %>% + summarize(n = n(), + sdwait1 = sd(wait1*60, na.rm = TRUE), + sdwait2 = sd(wait2*60, na.rm = TRUE) + ) + + # get the precentage of actions by role... based on occurrences + Clinic_role_pct = otr %>% + group_by(Clinic, Role) %>% + summarize(n=n()) %>% + mutate(RolePct = n/sum(n)) %>% + select(Clinic, Role, RolePct) %>% + spread(Role, RolePct) + + # get LOS by clinic based on visits + Clinic_LOS_pct = visits %>% + group_by(Clinic, LOS_CPT) %>% + summarize(n=n()) %>% + mutate(LOSPct = n/sum(n)) %>% + spread(Clinic, Role) + + # Make contingency table Clinic x LOS + C_LOS = visits %>% + group_by(Clinic, LOS_CPT) %>% + summarize(n=n()) %>% + spread( LOS_CPT,n) + + # Convert NA to zero and then convert to table. Carefully remove rows/cols that make no sense + C_LOS= as.data.frame(C_LOS) + C_LOS[is.na(C_LOS)] =0 + C_LOS = matrix(unlist(C_LOS), nrow=6, ncol=20)[1:5,2:19] + chisq.test(C_LOS) + + +# Make contingency table Clinic x Role +C_R = otr %>% + group_by(Clinic, Role) %>% + summarize(n=n()) %>% + spread( Role, n) + +# Convert NA to zero +C_R = as.data.frame(C_R) +C_R[is.na(C_R)] =0 +C_R = matrix(unlist(C_R), nrow=5, ncol=10)[1:5,2:10] +chisq.test(C_R) + + Clinic_staffing = otr %>% + group_by(Clinic,Role) %>% + summarize(total_staff=n_distinct(Role_ID)) %>% + spread( Role, total_staff) + + + # Get the handoffs per visit. 
Use the distinct chunks in the Visit-Role threads (VRThrds) + Role_Handoffs = VRThreads %>% + group_by(Clinic,Visit_ID) %>% + summarize(ho = n_distinct(threadNumVR)) + + Role_Handoffs_by_clinic = Role_Handoffs %>% + group_by(Clinic) %>% + summarize(avg_ho = mean(ho), + median_ho = median(ho), + max_ho = max(ho), + sd_ho = sd(ho)) + + # get the average staffing levels per day + Clinic_daily_staffing = otr %>% + group_by(Clinic,ymd,Role) %>% + summarize(total_staff=n_distinct(Role_ID)) + + Clinic_AVG_daily_staffing = Clinic_daily_staffing %>% + group_by(Clinic,Role) %>% + summarize(AVG_staff=mean(total_staff, na.rm = TRUE)) %>% + spread( Role, AVG_staff) + + Clinic_AVG_daily_staffing = Clinic_daily_staffing %>% + group_by(Clinic,Role) %>% + summarize(AVG_staff=median(total_staff, na.rm = TRUE)) %>% + spread( Role, AVG_staff) + + # let's look at the actions by clinic_role, using the visit_role threads +actions_by_role = VRThreads %>% + group_by(Clinic,Role_VR) %>% + summarize(avgActions = mean(Action_countVR)) %>% + spread( Role_VR, avgActions) + +# Look at alignment by clinic +visits %>% + group_by(Clinic,Phase) %>% + summarize(w=max(ThreadDuration,na.rm = TRUE) ) %>% + spread( Phase, w) + + +# Get LOS by clinic-day +LOS_by_clinic_day = visits %>% + group_by(Clinic_ymd) %>% + summarize(los=mean(LOS,na.rm = TRUE) ) + + +# See if diagnoses differ by time period +Diag_by_phase = visits %>% + group_by(Phase,Diagnosis_Group) %>% + summarize(n=n() ) %>% + spread( Diagnosis_Group, n) + +# See if actions differ by time period +# cds_435_1 is a dataframe with one for for each clinic_day and ~300 columns for the actions +Action_by_phase = cds_435_1 %>% + group_by(Phase) %>% + summarize_at(8:307,median ) + } + +remove_outliers <- function(x, na.rm = TRUE, ...) { + qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...) + H <- 1.5 * IQR(x, na.rm = na.rm) + y <- x + y[x < (qnt[1] - H)] <- NA + y[x > (qnt[2] + H)] <- NA + y +} + +# converts visit_ID into character string with factor levels for actions +make_visit_string <- function(occ, vid){ + + return(paste(as.character(as.numeric(occ$Action[occ$Visit_ID==as.character(vid)])),collapse = '-')) +} + +# Converts list of actions into regex of factor levels +make_process_pattern <- function(lvls, p){ + + return( paste( sapply(p, function(x) {as.character(which(lvls==x))}) , collapse = '.*') ) +} + +# Find process patterns +find_process_pattern <- function(occ, p, vid){ + + x = make_process_pattern( levels(ot$Action), p) + print(x) + + m = sapply(vid, function(v) { + (grepl( x, make_visit_string(occ, v))) } ) + # + + +# m = grep( x, make_visit_string(occ, vid)) + + return(m) + +} + +dualWindowCosine <- function(df, clinic_col, colrange, windowsize){ + + + rdf = data.frame(clinic = df[windowsize:(nrow(df)-windowsize),..clinic_col], + ymd = df[windowsize:(nrow(df)-windowsize),ymd], + corr = sapply(windowsize:(nrow(df)-windowsize), + function(x) { + distance(rbind( colSums(df[(x-windowsize+1):x,..colrange]),colSums(df[(x+1):(x+windowsize), ..colrange ]) ), + method='cosine' ) }) + ) + return(rdf) +} + + # # make_box_plots <- function(){ # ggboxplot(ACHR_test[NEvents>100 & Clinic=='DRH'], x = "VisitMonth", y = "NetComplexity",