1. GET ALL FEMALE SNOMEDS (that have the word ‘female’ within the concept_name) and their ancestors
--This script does the following process in VA OMOP:
/***********PROCESS TO COMPARE SNOMED CODES BY GENDER IN OMOP CONDITION TABLE (the last couple steps could be changed to use other domains as well)******************
1. GET ALL FEMALE SNOMEDS (that have the word 'female' within the concept_name) and their ancestors (line 46-57)
2. Get the count of ancestors per female descendant (line 65)
3. GET ALL MALE SNOMEDS (that have the word 'male' within the concept_name) and their ancestors (line 80)
4. Get the count of ancestors per male descendant (line 96)
5. FIND ALL POSSIBLE MALE-FEMALE SNOMED MATCHES THAT HAVE THE SAME NUMBER OF ANCESTORS (line 116-134)
6. CHECK WHETHER RESULTING MATCHES FROM step 5 HAVE THE SAME ANCESTORS (line 138-212)
a. CREATE WHILE LOOP TO CHECK WHETHER ALL ROWS IN #ANCESTORCT_MATCHES TABLE ARE
TRUE MATCHES OR JUST HAPPEN TO HAVE THE SAME NUMBER OF ANCESTORS AND BE OPPOSITE GENDER CODES
7. create permanent table with gender relatives (snogen.SNOMED_GenderRelatives) (line 220)
8. Create SNOMED gender relatives BASED ON THE ONLY DIFFERENCE IS MALE VS FEMALE IN THE CONCEPT_NAME, EVEN IF ANCESTRY IS NOT AN EXACT MATCH (line 239)
9. Create new table with ancestry matches and concept_name matches (snogen.SNOMED_GenderRelatives2) (line 251-281)
10. DEFINE THE DOMAINS THESE GENDER SPECIFIC SNOMEDS BELONG TO (line 287)
11. Pull all female codes in condition table that have a male gender snomed relative / Pull all male codes in condition table that have a female gender snomed relative (line 305-328)
12. create summary table (337-366)
13. continue with any other relevant domains be repeating steps 11 and 12 for different OMOP domains (ex - PROCEDURE, OBSERVATION, MEASUREMENT, etc...)...
***/
/*OMOP TABLES USED (with a find and replace for each of these, the specific paths can be changed and the complete script can be run):
OMOPV5.CONCEPT_ANCESTOR
OMOPV5.CONCEPT
OMOPV5.CONDITION_OCCURRENCE
OMOPV5.PERSON
*/
/*Permanent tables created and used:
snogen.SNOMED_GenderRelatives
snogen.SNOMED_GenderRelatives2
snogen.discordance_summary
*/
/*INSTRUCTIONS:
DEFINE OMOP TABLE PATHS FOR CONCEPT, CONCEPT_ANCESTOR, CONDITION_OCCURRENCE, and PERSON TABLES in script
DEFINE PERMANENT TABLE PATHS (alter or create snogen schema in preferred database)
DEFINE DATABASE WHERE permanent tables will be created.
RUN SCRIPT & EVALUATE resulting snogen.discordance_summary table
*/
--define database where permanent tables will be created
USE DATABASE_NAME_HERE
GO
--define schema where tables will be created
CREATE SCHEMA [snogen]
GO
----GET ALL FEMALE SNOMEDS (that have the word 'female' within the concept_name) and their ancestors
DROP TABLE IF EXISTS #allsnomed;
select distinct ANCESTOR_CONCEPT_ID, MIN_LEVELS_OF_SEPARATION, c.concept_name, c.concept_code, c.vocabulary_id, a.DESCENDANT_CONCEPT_ID
into #allsnomed
from OMOPV5.CONCEPT_ANCESTOR a
inner join OMOPV5.CONCEPT c
on a.ANCESTOR_CONCEPT_ID=c.CONCEPT_ID
inner join OMOPV5.CONCEPT d
on a.DESCENDANT_CONCEPT_ID=d.CONCEPT_ID
where d.CONCEPT_NAME LIKE '%female%' and d.VOCABULARY_ID = 'SNOMED' --25434 rows affected (includes MedDRA ancestors)
and c.VOCABULARY_ID = 'SNOMED' --15726
order by MIN_LEVELS_OF_SEPARATION
--(17543 rows affected)
--count the number of unique descendant codes (female codes)
--SELECT COUNT(*) , COUNT(DISTINCT descendant_concept_id)
--FROM #allsnomed WHERE MIN_LEVELS_OF_SEPARATION = 0 --795
---get the count of ancestors per descendant
DROP TABLE IF EXISTS #femaledescendants;
select a.*, ancestorct
INTO #femaledescendants
FROM #allsnomed a
INNER JOIN (select DESCENDANT_CONCEPT_ID, COUNT(*) ancestorct
FROM #allsnomed
where MIN_LEVELS_OF_SEPARATION<>0
GROUP BY DESCENDANT_CONCEPT_ID) b --Count the ancestors (this will include the descendant as an ancestor to itself)
ON a.DESCENDANT_CONCEPT_ID = b.DESCENDANT_CONCEPT_ID
WHERE MIN_LEVELS_OF_SEPARATION = 0
--795 rows in 0 sec
---GET ALL MALE SNOMEDS (that have the word 'male' within the concept_name) and their ancestors
DROP TABLE IF EXISTS #allsnomed_m;
select distinct ANCESTOR_CONCEPT_ID, MIN_LEVELS_OF_SEPARATION, c.concept_name, c.concept_code, c.vocabulary_id, a.DESCENDANT_CONCEPT_ID
into #allsnomed_m
from OMOPV5.CONCEPT_ANCESTOR a
inner join OMOPV5.CONCEPT c
on a.ANCESTOR_CONCEPT_ID=c.CONCEPT_ID
inner join OMOPV5.CONCEPT d
on a.DESCENDANT_CONCEPT_ID=d.CONCEPT_ID
where d.CONCEPT_NAME LIKE '%male%' and d.concept_name not like '%female%'
and d.VOCABULARY_ID = 'SNOMED'
and c.VOCABULARY_ID = 'SNOMED'
order by MIN_LEVELS_OF_SEPARATION
--(12286 row(s) affected) in dbs101
--(25434 rows affected) in rb03
---get the count of ancestors per descendant
DROP TABLE IF EXISTS #maledescendants;
select a.*, ancestorct
INTO #maledescendants
FROM #allsnomed_m a
INNER JOIN (select DESCENDANT_CONCEPT_ID, COUNT(*) ancestorct
FROM #allsnomed_m
where MIN_LEVELS_OF_SEPARATION<>0
GROUP BY DESCENDANT_CONCEPT_ID) b --Count the ancestors (this will include the descendant as an ancestor to itself)
ON a.DESCENDANT_CONCEPT_ID = b.DESCENDANT_CONCEPT_ID
WHERE MIN_LEVELS_OF_SEPARATION = 0
--589 rows in 0 sec
------------------------------------------------------------------------------
----FIND ALL POSSIBLE MALE-FEMALE SNOMED MATCHES THAT HAVE THE SAME NUMBER OF ANCESTORS
DROP TABLE IF EXISTS #ancestorct_matches;
with female as (
select *--, ROW_NUMBER()over(partition by descendant_concept_id order by min_levels_of_separation, concept_code) ancestor_id
from #femaledescendants --order by MIN_LEVELS_OF_SEPARATION
--where DESCENDANT_CONCEPT_ID=195007
)
, male as (
select *--, ROW_NUMBER()over(partition by descendant_concept_id order by min_levels_of_separation, concept_code) ancestor_id
FROm #maledescendants
--where DESCENDANT_CONCEPT_ID = 40480232
)
select a.DESCENDANT_CONCEPT_ID as F_Concept_ID, a.CONCEPT_NAME as F_Concept_name, a.CONCEPT_CODE as F_Concept_code,
b.DESCENDANT_CONCEPT_ID as M_Concept_ID, b.CONCEPT_NAME as M_Concept_name, b.CONCEPT_CODE as M_Concept_code, b.ancestorct as AncestorCt,
ROW_NUMBER() OVER(ORDER BY b.DESCENDANT_CONCEPT_ID) as ID --create id to be used by the looping function to go through every row
INTO #ancestorct_matches
FROM female a
inner join male b
on a.ancestorct = b.ancestorct
--12865
------------------------CHECK WHETHER THESE MATCHES HAVE THE SAME ANCESTORS
ALTER TABLE #ancestorct_matches
ADD code_match INT
GO
/************
--CREATE WHILE LOOP TO CHECK WHETHER ALL ROWS IN #ANCESTORCT_MATCHES TABLE ARE
--TRUE MATCHES OR JUST HAPPEN TO HAVE THE SAME NUMBER OF ANCESTORS AND BE OPPOSITE GENDER CODES
The following Loop runs through all rows of the #ancestorct_matches table:
it checks if the gender codes in the table (which have already been determined to have the same number of ancestors) also
have the exact same ancestors.
It does this by looking at each male-female snomed (matched by ancestor count):
It pulls all ancestors for each descendant and checks if the count(*) of the male ancestors inner joined to the
female ancestors ON the ancestor_code and the min_levels_of_separation is the same as the count they were matched on.
Since the table they pull from is unique, (one row per descendant-ancestor combination), then this should be the same
only if all of their ancestors are the exact same.
*************/
DECLARE @cntr INT = 0;
DECLARE @total INT
SELECT @total = COUNT(*) FROM #ancestorct_matches
print(@total)
DECLARE @m_concept_ID bigint
DECLARE @f_concept_ID bigint
DECLARE @ancestorct int
DECLARE @conceptmatchct int
DECLARE @match int
WHILE @cntr<@total
BEGIN
--define the concept IDs to check for matches that exist in the row of the @cntr
SELECT @m_concept_ID = m_concept_ID, @f_concept_ID = F_Concept_ID, @ancestorct = AncestorCt
FROM #ancestorct_matches where ID = @cntr;
--define the rows in the tables for each ancestry that belong to the descendant concepts
With fem as (
SELECT *
FROM #allsnomed
where DESCENDANT_CONCEPT_ID = @f_concept_ID
and MIN_LEVELS_OF_SEPARATION <> 0
), men as (
SELECT *
FROM #allsnomed_m
where DESCENDANT_CONCEPT_ID = @m_concept_ID
and MIN_LEVELS_OF_SEPARATION <> 0
)
--see if the rows match between tables for each descendant
SELECT @conceptmatchct = COUNT(*)
--SELECT f.*, m.*
FROM fem f
INNER JOIN men m
ON f.ANCESTOR_CONCEPT_ID = m.ANCESTOR_CONCEPT_ID
and f.MIN_LEVELS_OF_SEPARATION = m.MIN_LEVELS_OF_SEPARATION
--if the join returns the same number of ancestors, see if ancestors are the same
IF @conceptmatchct = @ancestorct
BEGIN
SET @match = 1
UPDATE #ancestorct_matches
SET code_match = 1
FROM #ancestorct_matches
WHERE id = @cntr
END
IF @conceptmatchct <> @ancestorct
BEGIN
UPDATE #ancestorct_matches
SET code_match = 0
FROM #ancestorct_matches
WHERE id = @cntr
END
--print('M_concept:'+cast(@m_concept_ID as varchar(10))+', F_concept:'+cast(@f_concept_ID as varchar(10))+', Match = '+cast(@Match as varchar(8)))
--print('ancestor match count = '+ cast(@conceptmatchct as varchar(8)))
--print('ancestor count = '+ cast(@ancestorct as varchar(8)));
SET @cntr = @cntr+1
END
---evaluate ancestor matches by count and ancestor matches by same ancestry
--select COUNT(*)
--FROM #ancestorct_matches--12865
--where code_match = 1 --271 matches that have the exact same ancestry
--create permanent table with gender relatives
DROP TABLE IF EXISTS snogen.SNOMED_GenderRelatives;
select *
INTO snogen.SNOMED_GenderRelatives
FROM #ancestorct_matches
where code_match = 1 --264 matches that have the exact same ancestry
select F_Concept_ID, F_Concept_name, F_Concept_code, M_Concept_ID, M_Concept_name, M_Concept_code
, AncestorCt as F_ancestorct, ancestorct as m_ancestor_ct
INTO #ancestorrelatives
FROM snogen.SNOMED_GenderRelatives a
--SELECT COUNT(distinct M_concept_ID), COUNT(distinct F_concept_iD)
--FROM #ancestorct_matches
--where code_match = 1 --145 155
/*********
This table is not 1 to 1, but many to many... This means that some male codes have the same ancestry as multiple female codes and vice versa.
In fact, many female codes have the same ancestry as many male codes, which may make determination of the exact miscode difficult.
It may be possible to evaluate any mis-gendered codings due to the small number of resulting matches here (only 264 total matches)
for 140 male concepts and 151 female concepts.
********/
---ALSO NEED TO ADD IN SNOMED RELATIVES BASED ON THE ONLY DIFFERENCE IS MALE VS FEMALE IN THE CONCEPT_NAME, EVEN IF ANCESTRY IS NOT AN EXACT MATCH
drop table #sameConceptname;
select female.concept_id as f_concept_id, female.CONCEPT_NAME as f_concept_name, female.CONCEPT_CODE as F_concept_code
,male.CONCEPT_ID m_Concept_ID, male.concept_name as M_Concept_name, male.concept_code as M_concept_code
into #sameConceptname
from OMOPV5.CONCEPT female
inner join OMOPV5.concept male
on ( replace(male.CONCEPT_NAME,'male','') = replace(female.concept_name,'female',''))
and (female.vocabulary_id='snomed' and male.VOCABULARY_ID='snomed') and (female.STANDARD_CONCEPT='s' and male.STANDARD_CONCEPT='s')
and (male.CONCEPT_NAME like '%male%' and female.CONCEPT_NAME like '%female%')
order by female.CONCEPT_NAME
--359 rows
--Create new table with ancestry matches and concept_name matches
DROP TABLE IF EXISTS snogen.SNOMED_GenderRelatives2;
SELECT a.*, b.ancestorct f_ancestorct, c.ancestorct m_ancestorct
INTO snogen.SNOMED_GenderRelatives2
FROM #sameConceptname a
inner join #femaledescendants b
ON a.f_concept_id = b.DESCENDANT_CONCEPT_ID
inner join #maledescendants c
ON a.m_concept_id = c.DESCENDANT_CONCEPT_ID
UNION
SELECT F_Concept_ID, F_Concept_name, F_Concept_code, M_Concept_ID, M_Concept_name, M_Concept_code
, AncestorCt as F_ancestorct, ancestorct as m_ancestor_ct FROM snogen.SNOMED_GenderRelatives
--510 rows
ALTER TABLE snogen.SNOMED_GenderRelatives2
ADD AncestorTreeMatch INT,
NameMatch INT,
AncestorCtMatch INT
GO
UPDATE snogen.SNOMED_GenderRelatives2
SET AncestorTreeMatch = CASE WHEN CAST(f_concept_ID as varchar(12)) + CAST(m_concept_ID as varchar(12))
in (select CAST(f_concept_ID as varchar(12)) + CAST(m_concept_ID as varchar(12))
FROM #ancestorrelatives) THEN 1 ELSE 0 END,
NameMatch = CASE WHEN CAST(f_concept_ID as varchar(12)) + CAST(m_concept_ID as varchar(12))
in (select CAST(f_concept_ID as varchar(12)) + CAST(m_concept_ID as varchar(12))
FROM #sameConceptname) THEN 1 ELSE 0 END,
AncestorCtMatch = CASE WHEN CAST(f_concept_ID as varchar(12)) + CAST(m_concept_ID as varchar(12))
in (select CAST(f_concept_ID as varchar(12)) + CAST(m_concept_ID as varchar(12))
FROM #ancestorct_matches) THEN 1 ELSE 0 END
FROM snogen.SNOMED_GenderRelatives2
--510 rows
SELECT a.*, b.CONCEPT_NAME, b.CONCEPT_CODE, b.VOCABULARY_ID, b.STANDARD_CONCEPT
FROM
(select distinct DESCENDANT_CONCEPT_ID FROM #allsnomed) a
INNER JOIN OMOPV5.CONCEPT b
on a.DESCENDANT_CONCEPT_ID = b.CONCEPT_ID and b.CONCEPT_NAME like '%pregnancy%'
left join snogen.SNOMED_GenderRelatives2 c
on a.DESCENDANT_CONCEPT_ID = c.F_Concept_ID
and c.F_Concept_ID is null
----------DEFINE THE DOMAINS THESE BELONG TO:
/*
SELECT a.DOMAIN_ID, COUNT(*) ct
FROM OMOPV5.CONCEPT a
INNER JOIN (SELECT DISTINCT F_concept_ID FROM OMOP_V5_QA.snogen.SNOMED_GenderRelatives2) b
ON a.CONCEPT_ID = b.F_Concept_ID
GROUP BY a.DOMAIN_ID
*/
/* --all of these domains can be checked for discordant coding
Condition 154
Device 42
Measurement 1
Observation 40
Procedure 65
Relationship 3
Spec Anatomic Site 85
Specimen 2
*/
------------Pull all female codes in condition table that have a male gender snomed relative
DROP TABLE IF EXISTS snogen.CONDITIONS_genderF;
SELECT DISTINCT a.Person_ID, CONDITION_CONCEPT_ID, a.CONDITION_START_DATE
, CONDITION_SOURCE_CONCEPT_ID
INTO snogen.CONDITIONS_genderF
FROM OMOPV5.CONDITION_OCCURRENCE a
INNER JOIN (SELECT DISTINCT F_concept_ID FROM snogen.SNOMED_GenderRelatives2) b
ON a.CONDITION_CONCEPT_ID = b.F_Concept_ID
INNER JOIN OMOPV5.PERSON c
ON a.person_ID = c.PERSON_ID
--(910338 rows affected) in 13 min 54 sec
------------Pull all male codes in condition table that have a female gender snomed relative
DROP TABLE IF EXISTS snogen.CONDITIONS_genderM;
SELECT DISTINCT a.Person_ID, CONDITION_CONCEPT_ID, a.CONDITION_START_DATE
, CONDITION_SOURCE_CONCEPT_ID
INTO snogen.CONDITIONS_genderM
FROM OMOPV5.CONDITION_OCCURRENCE a
INNER JOIN (SELECT DISTINCT M_concept_ID FROM snogen.SNOMED_GenderRelatives2) b
ON a.CONDITION_CONCEPT_ID = b.M_Concept_ID
INNER JOIN OMOPV5.PERSON c
ON a.person_ID = c.PERSON_ID
--(337167 rows affected) in 8 min 47 sec
------evaluating discordance------------------------------------------------------------
DROP TABLE IF EXISTS snogen.discordance_summary;
select CONDITION_CONCEPT_ID, b.M_Concept_name as Concept_name, 'male' as ConceptGender
, COUNT(distinct Person_ID) pat_ct
, COUNT(*) as instance_ct
, SUM(CASE WHEN gender_source_value = 'F' THEN 1 ELSE 0 END) as FemaleInstanceCt
, SUM(CASE WHEN gender_source_value = 'M' THEN 1 ELSE 0 END) as MaleInstanceCt
, CAST(SUM(CASE WHEN gender_source_value = 'F' THEN 1 ELSE 0 END)*100.0 /COUNT(*) as dec(4,2)) DiscordantInstancepct
INTO snogen.discordance_summary
FROM snogen.Conditions_genderm a
INNER JOIN snogen.SNOMED_GenderRelatives2 b
on a.CONDITION_CONCEPT_ID = b.m_Concept_ID
GROUP BY CONdition_CONCEPT_ID, b.M_Concept_name
UNION
select CONDITION_CONCEPT_ID, b.F_Concept_name as Concept_name, 'female' as ConceptGender
, COUNT(distinct Person_ID) pat_ct
, COUNT(*) as instance_ct
, SUM(CASE WHEN gender_source_value = 'F' THEN 1 ELSE 0 END) as FemaleInstanceCt
, SUM(CASE WHEN gender_source_value = 'M' THEN 1 ELSE 0 END) as MaleInstanceCt
, CAST(SUM(CASE WHEN gender_source_value = 'M' THEN 1 ELSE 0 END)*100.0 /COUNT(*) as dec(4,2)) DiscordantInstancepct
FROM snogen.Conditions_genderF a
INNER JOIN snogen.SNOMED_GenderRelatives2 b
on a.CONDITION_CONCEPT_ID = b.f_Concept_ID
GROUP BY CONdition_CONCEPT_ID, b.f_Concept_name
--50 rows in 2 sec
-- evaluating concepts with large discordance (>100 instances)
select * FROM snogen.discordance_summary
where (ConceptGender = 'female' and MaleInstanceCt>100)
OR
( ConceptGender = 'male' and FemaleInstanceCt>100)
order by DiscordantInstancepct desc
select conceptgender, COUNT(*) conceptct, sum(instance_ct) instancect
, sum(FemaleInstanceCt) femaleinstancect, sum(maleinstancect) maleinstancect
FROM snogen.discordance_summary
GROUP BY conceptgender
/* OTHER EVALUATIONS
select a.*, b.CONCEPT_CODE as SNOMED_Code
FROM snogen.discordance_summary a
inner join OMOPV5.concept b
on a.CONDITION_CONCEPT_ID = b.CONCEPT_ID
WHERE DiscordantInstancepct>0
order by DiscordantInstancepct desc
select SUM(instance_ct)
FROM (
select MaleInstanceCt instance_ct
FROM snogen.discordance_summary
WHERE ConceptGender = 'female'
UNION
SELECT femaleinstancect
FROM snogen.discordance_summary
WHERE ConceptGender = 'male'
) a
select COUNT_BIG(*) instance_ct, COUNT(distinct condition_concept_id) as conceptct
FROM OMOPV5.CONDITION_OCCURRENCE
WHERE condition_concept_ID IN
(
SELECT DISTINCT F_concept_ID FROM snogen.SNOMED_GenderRelatives2
UNION
SELECT DISTINCT m_concept_ID FROM snogen.SNOMED_GenderRelatives2
)
*/