# Databricks notebook source
# DBTITLE 1,201805 dataset parameters
# prerequisites:
# 1: a (scopus) Hive table with affiliation/author/citation data
# 2: for the subject classifications, 2 Hive tables:
#    a: ScienceMetrix 176 subfield classifications to Source-IDs
#    b: ScienceMetrix 176 subfields, 22 fields and 6 domains hierarchy

Last_ani='ani_20180503'      # snapshot
sort_pub_year='sort_year'    # or 'pub_year'
minyear='1960'
mincityear='1996'
maxyear='2017'
cachePath='/mnt/dest/bucket/example/path/available/in/databricks/'
cacheDb='dbName'

# COMMAND ----------

from pyspark.sql.types import *
from pyspark.sql import functions as func
from pyspark.sql import Window
from pyspark.sql import Row

# for printing labels on the result columns:
Y1=minyear[2:]
Y2=mincityear[2:]
Y3=maxyear[2:]

csvOutFileName_s1='Table-S1_'+Last_ani+'_'+minyear+'_'+mincityear+'_'+maxyear+'_'+sort_pub_year
csvOutFileName_s3='Table-S3_'+Last_ani+'_'+minyear+'_'+mincityear+'_'+maxyear+'_'+sort_pub_year

df_ani=table('scopus.'+Last_ani)
print("running with: "+Last_ani)

sqlContext.sql('DROP TABLE IF EXISTS '+cacheDb+'.temp_df_agg_count')
sqlContext.sql('DROP TABLE IF EXISTS '+cacheDb+'.temp_df_agg_count_ln_wmaxc')
sqlContext.sql('DROP TABLE IF EXISTS '+cacheDb+'.temp_df_agg_count_ln_max')
sqlContext.sql('DROP TABLE IF EXISTS '+cacheDb+'.temp_df_ani_auth_affinst')
sqlContext.sql('DROP TABLE IF EXISTS '+cacheDb+'.temp_df_ani_auth_name')

# COMMAND ----------

# for each author in Scopus, get their latest record and select the name from that record.
# we will use this at the end to label each author record for which we create stats.
df_ani_auth_name_t=(
  df_ani
  .withColumn('Au',func.explode('Au'))
  .orderBy('datesort',ascending=False)
  #.filter('Au.given_name_pn IS NOT NULL')
  # no need to filter if we order and prefer the presence of the given_name:
  .withColumn(
    'recRank',
    func.rank().over(
      Window
      .partitionBy('Au.auid')
      .orderBy(
        func.expr('IF(Au.given_name_pn IS NULL,0,1)').desc(),
        func.desc('datesort'),
        func.desc('Eid'),
        # some records repeat the same auid multiple times; this breaks those ties:
        func.desc('Au.authorseq')
      )
    )
  )
  .withColumn('lastPubYear',func.max(sort_pub_year).over(Window.partitionBy('Au.auid')))
  .withColumn('firstPubYear',func.min(sort_pub_year).over(Window.partitionBy('Au.auid')))
  .filter('recRank=1')
  .select(
    func.col('Au.auid').alias('auid'),
    func.col('Eid').alias('LastEidName'),
    func.expr('CONCAT(Au.surname_pn,", ",IFNULL(Au.given_name_pn,Au.initials_pn))').alias('authfull'),
    'lastPubYear',
    'firstPubYear'
  )
)

tbn_df_ani_auth_name="temp_df_ani_auth_name"
if tbn_df_ani_auth_name not in sqlContext.tableNames(cacheDb):
  print("Generating "+cacheDb+"."+tbn_df_ani_auth_name)
  df_ani_auth_name_t.repartition(200).write.mode("overwrite").saveAsTable(cacheDb+"."+tbn_df_ani_auth_name)
else:
  print("Reusing existing "+cacheDb+"."+tbn_df_ani_auth_name)
df_ani_auth_name=table(cacheDb+"."+tbn_df_ani_auth_name)

# COMMAND ----------

# citation counts (mincityear onwards)
df_mincityear_onw_cit=(
  df_ani
  .filter(sort_pub_year+' >= '+mincityear)
  .select(
    func.col('Eid').alias('CitingEid'),
    func.explode('citations').alias('Eid'),
    func.col('Au.auid').alias('CitingAuids')
  )
  .distinct()
  .join(
    df_ani.select(
      'Eid',
      func.col('Au.auid').alias('CitedAuids')
    ),["Eid"]
  )
  .withColumn('overLappingAuthors',func.size(func.array_intersect('CitingAuids','CitedAuids')))
  .select(
    "CitingEid",
    "Eid",
    func.expr("IF(overLappingAuthors>0,1,0)").alias('isSelfCitation'),
    func.expr("IF(overLappingAuthors>0,NULL,CitingEid)").alias('CitingEidNonSelf'),
  )
  .groupBy('Eid')
  .agg(
    func.count('*').alias('CitationCount'),
    func.sum('isSelfCitation').alias('SelfCitationCount'),
    (func.count('*')-func.sum('isSelfCitation')).alias('CitationCountNonSelf'),
    func.collect_list('CitingEid').alias('CitingEids'),
    func.collect_list('CitingEidNonSelf').alias('CitingEidsNonSelf'),
  )
)

# COMMAND ----------

# get the Science-Metrix tables for subject classification; these map Scopus source-ids
# (i.e. journals, serials etc.) to Science-Metrix classes.
df_smc_m=table(cacheDb+".ScienceMetrixMapping_POC_Journal_Subject")
df_smc_c=table(cacheDb+".ScienceMetrixMapping_POC_SubjectStructure")

# window definitions used in the next command to calculate the volume and rank of the
# (sub)fields per author, so we can assign the best subject per author.
w_au=Window.partitionBy('auid')
w_au_sf=Window.partitionBy('auid','subfieldCode')
w_sf=Window.partitionBy('subfieldCode')
# this window has no ties between different subfields, so dense_rank can be used to
# de-duplicate the results (all rows ranked #1 carry the same item; no risk of two
# different items both being ranked #1).
# note also the first ordering term, subfieldCode IS NULL: it pushes the count of
# null subfields to the END of the ranking.
w_au_sort_sf_auth_overall=Window.partitionBy('auid').orderBy(
  func.expr('subfieldCode IS NULL').asc(),
  func.desc('subfieldCode_count_thisauth'),
  func.asc('subfieldCode_count_overall'),
  func.asc('subfieldCode')
)
w_au_f=Window.partitionBy('auid','fieldCode')
w_f=Window.partitionBy('fieldCode')
# see the comment about ties above.
w_au_sort_f_auth_overall=Window.partitionBy('auid').orderBy(
  func.expr('fieldCode IS NULL').asc(),
  func.desc('fieldCode_count_thisauth'),
  func.asc('fieldCode_count_overall'),
  func.asc('fieldCode')
)
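
# Illustrative only (toy data, not part of the original flow): how the NULLs-last
# trick in the ranking windows above behaves. Papers without a subfield sort last,
# and dense_rank over the tie-free ordering yields exactly one rank-1 subfield per
# author. All names and values here are hypothetical.
toy_sf = spark.createDataFrame(
    [('a1', '1.1'), ('a1', '1.1'), ('a1', '2.3'), ('a1', None)],
    ['auid', 'subfieldCode'])
(toy_sf
 .withColumn('cnt', func.count('*').over(Window.partitionBy('auid', 'subfieldCode')))
 .withColumn('sfRank', func.dense_rank().over(
     Window.partitionBy('auid').orderBy(
         func.expr('subfieldCode IS NULL').asc(),  # push the NULL bucket to the end
         func.desc('cnt'),                         # most frequent subfield first
         func.asc('subfieldCode'))))               # tie-break so rank 1 is unique
 .show())
# '1.1' (2 papers) gets sfRank 1 on both rows, '2.3' gets 2, the NULL row sorts last.
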
# COMMAND ----------

# calculate the metrics per author in Scopus.
# Note: the ns_ and ws_ prefixes found in the code indicate NonSelf and WithSelf citation counts.
# list of metrics:
# npY1Y3    papers minyear-maxyear
# firstyr   year of first publication
# lastyr    year of most recent publication
# ncY2Y3    total cites mincityear-maxyear
# ncY2Y3_cp number of citing papers (unique) citing ncY2Y3
# hY3       h-index as of end-maxyear
# hmY3      hm-index as of end-maxyear
# nps       number of single authored papers
# ncs       total cites to single authored papers
# npsf      number of single+first authored papers
# ncsf      total cites to single+first authored papers
# npsfl     number of single+first+last authored papers
# ncsfl     total cites to single+first+last authored papers
wns = (Window.partitionBy('auid').orderBy(func.desc('CitationCountNonSelf'),'Eid'))
wws = (Window.partitionBy('auid').orderBy(func.desc('CitationCount'),'Eid'))
wsm = (Window.partitionBy('auid','subfieldCode'))
wsm_a = (Window.partitionBy('subfieldCode'))

df_agg_count_t=(
  df_ani
  #.filter('citation_type IN ("ar","cp","re")')
  .filter(sort_pub_year+' <= '+maxyear)
  .join(df_mincityear_onw_cit,["Eid"],"LEFT_OUTER")
  .na.fill({'CitationCountNonSelf':0,'CitationCount':0})
  .select(
    'Eid',
    sort_pub_year,
    func.size('Au').alias('n_authors'),
    func.explode('Au').alias('Au'),
    'CitationCountNonSelf',
    'CitationCount',
    'CitingEidsNonSelf',
    'CitingEids',
    func.col('source.srcid').alias('srcid'),
    'citation_type'
  )
  .withColumn('auid',func.col('Au.auid'))
  .withColumn('Authorseq',func.col('Au.Authorseq'))
  .drop('Au')
  # join ScienceMetrix to count volume per subfield code
  .join(df_smc_m,["srcid"],"LEFT_OUTER")
  .withColumn('subfieldcode',func.expr('IF(citation_type IN ("ar","cp","re"),subfieldcode,NULL)')) # assign a field only for ar/cp/re
  .join(df_smc_c,["subfieldCode"],"LEFT_OUTER")
  .withColumn('totalDocCountAuthor',func.count('*').over(w_au))
  .withColumn('arcpreDocCountAuthor',func.count(func.expr('IF(citation_type IN ("ar","cp","re"),Eid,NULL)')).over(w_au))
  .filter('arcpreDocCountAuthor>=5') # only authors with >= 5 ar/cp/re documents
  .withColumn('SMFCMdocCountAuthor',func.count('fieldCode').over(w_au)) # number of documents the author has mapped to a Science-Metrix class
  # calculate ranks for the 176-level subfields per author
  .withColumn('subfieldCode_count_thisauth',func.count('*').over(w_au_sf))
  .withColumn('subfieldCode_count_overall',func.count('*').over(w_sf))
  .withColumn('subfieldRank',func.dense_rank().over(w_au_sort_sf_auth_overall))
  .withColumn(
    "subfield_tuple",
    func.struct(
      'subfieldRank',
      'subfieldCode',
      'subfieldName',
      'subfieldCode_count_thisauth',
      'subfieldCode_count_overall',
      'SMFCMdocCountAuthor',
      func.expr('subfieldCode_count_thisauth/SMFCMdocCountAuthor').alias('subfieldFrac')
    )
  )
  # same for the 22-level fields
  .withColumn('fieldCode_count_thisauth',func.count('*').over(w_au_f))
  .withColumn('fieldCode_count_overall',func.count('*').over(w_f))
  .withColumn('fieldRank',func.dense_rank().over(w_au_sort_f_auth_overall))
  .withColumn(
    "field_tuple",
    func.struct(
      'fieldRank',
      'fieldCode',
      'fieldName',
      'fieldCode_count_thisauth',
      'fieldCode_count_overall',
      'SMFCMdocCountAuthor',
      func.expr('fieldCode_count_thisauth/SMFCMdocCountAuthor').alias('fieldFrac')
    )
  )
  # ranks and running sums needed to calculate the h and hm index
  .withColumn('ns_r_eff',func.sum(1/func.col('n_authors')).over(wns.rangeBetween(Window.unboundedPreceding, 0)))
  .withColumn('ns_r',func.rank().over(wns))
  .withColumn('ws_r_eff',func.sum(1/func.col('n_authors')).over(wws.rangeBetween(Window.unboundedPreceding, 0)))
  .withColumn('ws_r',func.rank().over(wws))
  .groupBy('auid')
  .agg(
    func.sort_array(func.collect_set("subfield_tuple"),True).alias("subFields"),
func.sort_array(func.collect_set("field_tuple"),True).alias("Fields"), func.sum(func.expr('IF('+sort_pub_year+' BETWEEN '+minyear+' AND '+maxyear+',1,0)')).alias('npY1Y3'), # no longer capture first/last here; we want to get those values from the full database and therefore collect them with the author names dataframe (where we also get the last known full prefereed name) #func.min(sort_pub_year).alias('firstyr'), #func.max(sort_pub_year).alias('lastyr'), func.sum('CitationCountNonSelf').alias('ns_ncY2Y3'), func.size(func.array_distinct(func.flatten(func.collect_list('CitingEidsNonSelf')))).alias('ns_ncY2Y3_cp'), func.max(func.expr('IF(ns_r<=CitationCountNonSelf,ns_r,0)')).alias('ns_hY3'), func.max(func.expr('IF(ns_r_eff<=CitationCountNonSelf,ns_r_eff,0)')).alias('ns_hmY3'), func.sum(func.expr('IF(n_authors=1,1,0)')).alias('ns_nps'), func.sum(func.expr('IF(n_authors=1,CitationCountNonSelf,0)')).alias('ns_ncs'), func.sum(func.expr('IF(n_authors=1 OR Authorseq=1,1,0)')).alias('ns_npsf'), func.sum(func.expr('IF(n_authors=1 OR Authorseq=1,CitationCountNonSelf,0)')).alias('ns_ncsf'), func.sum(func.expr('IF(n_authors=1 OR Authorseq=1 OR Authorseq=n_authors,1,0)')).alias('ns_npsfl'), func.sum(func.expr('IF(n_authors=1 OR Authorseq=1 OR Authorseq=n_authors,CitationCountNonSelf,0)')).alias('ns_ncsfl'), func.sum('CitationCount').alias('ws_ncY2Y3'), func.size(func.array_distinct(func.flatten(func.collect_list('CitingEids')))).alias('ws_ncY2Y3_cp'), func.max(func.expr('IF(ws_r<=CitationCount,ws_r,0)')).alias('ws_hY3'), func.max(func.expr('IF(ws_r_eff<=CitationCount,ws_r_eff,0)')).alias('ws_hmY3'), func.sum(func.expr('IF(n_authors=1,1,0)')).alias('ws_nps'), func.sum(func.expr('IF(n_authors=1,CitationCount,0)')).alias('ws_ncs'), func.sum(func.expr('IF(n_authors=1 OR Authorseq=1,1,0)')).alias('ws_npsf'), func.sum(func.expr('IF(n_authors=1 OR Authorseq=1,CitationCount,0)')).alias('ws_ncsf'), func.sum(func.expr('IF(n_authors=1 OR Authorseq=1 OR Authorseq=n_authors,1,0)')).alias('ws_npsfl'), func.sum(func.expr('IF(n_authors=1 OR Authorseq=1 OR Authorseq=n_authors,CitationCount,0)')).alias('ws_ncsfl'), ) ) # store this as a table if it doesn't already exist. tbn_df_agg_count="temp_df_agg_count" if tbn_df_agg_count not in sqlContext.tableNames(cacheDb): print ("Generating "+cacheDb+"."+tbn_df_agg_count) df_agg_count_t.repartition(200).write.mode("overwrite").saveAsTable(cacheDb+"."+tbn_df_agg_count) else: print ("Reusing existing "+cacheDb+"."+tbn_df_agg_count) df_agg_count=table(cacheDb+"."+tbn_df_agg_count) # more metrics: log scales of the calculated metrics. 
# more metrics: log scales of the calculated metrics.
# lnc    ln(ncY2Y3+1)
# lh     ln(hY3+1)
# lhm    ln(hmY3+1)
# lns    ln(ncs+1)
# lnsf   ln(ncsf+1)
# lnsfl  ln(ncsfl+1)
df_agg_count_ln=(
  df_agg_count
  .withColumn('ns_lnc',func.log(func.expr('ns_ncY2Y3+1')))
  .withColumn('ns_lh',func.log(func.expr('ns_hY3+1')))
  .withColumn('ns_lhm',func.log(func.expr('ns_hmY3+1')))
  .withColumn('ns_lns',func.log(func.expr('ns_ncs+1')))
  .withColumn('ns_lnsf',func.log(func.expr('ns_ncsf+1')))
  .withColumn('ns_lnsfl',func.log(func.expr('ns_ncsfl+1')))
  .withColumn('ws_lnc',func.log(func.expr('ws_ncY2Y3+1')))
  .withColumn('ws_lh',func.log(func.expr('ws_hY3+1')))
  .withColumn('ws_lhm',func.log(func.expr('ws_hmY3+1')))
  .withColumn('ws_lns',func.log(func.expr('ws_ncs+1')))
  .withColumn('ws_lnsf',func.log(func.expr('ws_ncsf+1')))
  .withColumn('ws_lnsfl',func.log(func.expr('ws_ncsfl+1')))
)

# and to complement further, the MAX of the log metrics above (aggregated across the DB):
# flnc   lnc/max(lnc)
# flh    lh/max(lh)
# flhm   lhm/max(lhm)
# flns   lns/max(lns)
# flnsf  lnsf/max(lnsf)
# flnsfl lnsfl/max(lnsfl)
# c      composite: sum of the six fractions (flnc+flh+flhm+flns+flnsf+flnsfl)
df_agg_count_ln_max_t=df_agg_count_ln.agg(
  func.max('ns_lnc').alias('ns_maxlnc'),
  func.max('ns_lh').alias('ns_maxlh'),
  func.max('ns_lhm').alias('ns_maxlhm'),
  func.max('ns_lns').alias('ns_maxlns'),
  func.max('ns_lnsf').alias('ns_maxlnsf'),
  func.max('ns_lnsfl').alias('ns_maxlnsfl'),
  func.max('ws_lnc').alias('ws_maxlnc'),
  func.max('ws_lh').alias('ws_maxlh'),
  func.max('ws_lhm').alias('ws_maxlhm'),
  func.max('ws_lns').alias('ws_maxlns'),
  func.max('ws_lnsf').alias('ws_maxlnsf'),
  func.max('ws_lnsfl').alias('ws_maxlnsfl')
)

tbn_df_agg_count_ln_max="temp_df_agg_count_ln_max"
if tbn_df_agg_count_ln_max not in sqlContext.tableNames(cacheDb):
  print("Generating "+cacheDb+"."+tbn_df_agg_count_ln_max)
  df_agg_count_ln_max_t.repartition(200).write.mode("overwrite").saveAsTable(cacheDb+"."+tbn_df_agg_count_ln_max)
else:
  print("Reusing existing "+cacheDb+"."+tbn_df_agg_count_ln_max)
df_agg_count_ln_max=table(cacheDb+"."+tbn_df_agg_count_ln_max)
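
# Illustrative only: the next cell divides each log metric by the database-wide
# maximum just computed and sums the six fractions into the composite indicator,
#   c = sum over the six metrics m of  ln(x_m + 1) / max over all authors of ln(x_m + 1)
# so each term lies in [0,1] and 0 <= c <= 6. A toy numeric check, with entirely
# hypothetical metric values and maxima:
import math
toy_metrics = {'nc': 120.0, 'h': 10.0, 'hm': 6.5, 'ncs': 30.0, 'ncsf': 80.0, 'ncsfl': 100.0}  # one author
toy_maxlog  = {'nc': 11.0,  'h': 5.6,  'hm': 5.0, 'ncs': 9.9,  'ncsf': 10.6,  'ncsfl': 10.9}  # max(ln(x+1)) per metric
toy_c = sum(math.log(v + 1) / toy_maxlog[k] for k, v in toy_metrics.items())
print(round(toy_c, 3))  # a single author's composite score, bounded by 6
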
# now bring the aggregate back into the metrics table so we can calculate the normalized scores.
df_agg_count_ln_wmaxc_t=(
  df_agg_count_ln
  .crossJoin(df_agg_count_ln_max)
  #.withColumn('flnc',func.expr('lnc/(max(lnc) over(Partition By true))'))
  .withColumn('ns_flnc',func.expr('ns_lnc/ns_maxlnc')).drop('ns_maxlnc')
  .withColumn('ns_flh',func.expr('ns_lh/ns_maxlh')).drop('ns_maxlh')
  .withColumn('ns_flhm',func.expr('ns_lhm/ns_maxlhm')).drop('ns_maxlhm')
  .withColumn('ns_flns',func.expr('ns_lns/ns_maxlns')).drop('ns_maxlns')
  .withColumn('ns_flnsf',func.expr('ns_lnsf/ns_maxlnsf')).drop('ns_maxlnsf')
  .withColumn('ns_flnsfl',func.expr('ns_lnsfl/ns_maxlnsfl')).drop('ns_maxlnsfl')
  # c = the composite metric
  .withColumn('ns_c',func.expr('ns_flnc+ns_flh+ns_flhm+ns_flns+ns_flnsf+ns_flnsfl'))
  .withColumn('ws_flnc',func.expr('ws_lnc/ws_maxlnc')).drop('ws_maxlnc')
  .withColumn('ws_flh',func.expr('ws_lh/ws_maxlh')).drop('ws_maxlh')
  .withColumn('ws_flhm',func.expr('ws_lhm/ws_maxlhm')).drop('ws_maxlhm')
  .withColumn('ws_flns',func.expr('ws_lns/ws_maxlns')).drop('ws_maxlns')
  .withColumn('ws_flnsf',func.expr('ws_lnsf/ws_maxlnsf')).drop('ws_maxlnsf')
  .withColumn('ws_flnsfl',func.expr('ws_lnsfl/ws_maxlnsfl')).drop('ws_maxlnsfl')
  .withColumn('ws_c',func.expr('ws_flnc+ws_flh+ws_flhm+ws_flns+ws_flnsf+ws_flnsfl'))
)

tbn_df_agg_count_ln_wmaxc="temp_df_agg_count_ln_wmaxc"
if tbn_df_agg_count_ln_wmaxc not in sqlContext.tableNames(cacheDb):
  print("Generating "+cacheDb+"."+tbn_df_agg_count_ln_wmaxc)
  df_agg_count_ln_wmaxc_t.repartition(200).write.mode("overwrite").saveAsTable(cacheDb+"."+tbn_df_agg_count_ln_wmaxc)
else:
  print("Reusing existing "+cacheDb+"."+tbn_df_agg_count_ln_wmaxc)
df_agg_count_ln_wmaxc=table(cacheDb+"."+tbn_df_agg_count_ln_wmaxc)

# COMMAND ----------

# in order to label the institution of each author:
# SciVal institution mappings
Last_ia=max([x for x in sqlContext.tableNames(cacheDb) if len(x)==32 and x[0:26]=="institution_affiliation_20"])
df_ia=table(cacheDb+'.'+Last_ia)
Last_i=max([x for x in sqlContext.tableNames(cacheDb) if len(x)==20 and x[0:14]=="institution_20"])
df_i=table(cacheDb+'.'+Last_i)

# for each author in Scopus, get their latest record and select the affiliation[s] from that record.
# inst_id    most recent institution ID
# inst_name  institution name
# cntry      country associated with most recent institution
# affil_id   most recent Scopus affiliation ID

# to get only one instid per afid:
df_afid_inst_unique=(
  df_ia
  .selectExpr('institution_id','afid')
  .withColumn('instrank',func.rank().over(Window.partitionBy('afid').orderBy(func.asc('institution_id'))))
  .filter('instrank=1').drop('instrank')
  .join(df_i.selectExpr('institution_id','name as sv_name','country_code as sv_country_code'),["institution_id"],"LEFT_OUTER")
)

df_ani_auth_affinst_t=(
  df_ani
  .withColumn('Au',func.explode('Au'))
  .withColumn("Au_af", func.explode("Au_af"))
  .filter('Au.Authorseq=Au_af.Authorseq') # only keep the au-af rows matching this author
  .selectExpr('*','Af[Au_af.affiliation_seq-1] as affiliation') # only keep the af rows matching this au-af
  .drop('Af')
  .withColumn('afid',func.explode('affiliation.affiliation_ids'))
  # drop nulls so that we can filter to only the last publication; if nulls were
  # included we might select "null" as the institution, which is not desirable.
  .filter('affiliation IS NOT NULL')
  .withColumn('recRank',func.rank().over(Window.partitionBy('Au.auid').orderBy(
    func.desc('datesort'),             # last record(s)
    # tie breakers:
    func.desc('Eid'),                  # so we only have one record
    func.asc('afid'),                  # so we only look at one AFID
    func.asc('Au_af.affiliation_seq'), # in case the same afid occurs multiple times, only get one
    func.asc('Au_af.Authorseq'),       # in case the same author ID occurs multiple times with a different sequence
  )))
  .filter('recRank=1') # keep only the last record
  .select(func.col('Au.auid').alias('auid'),func.col('Eid').alias('LastEidAff'),'afid','affiliation')
  .join(df_afid_inst_unique,['afid'],'LEFT_OUTER')
  .select(
    'auid',
    'LastEidAff',
    func.col('institution_id').alias('inst_id'),
    func.coalesce('sv_name',func.concat_ws(', ','affiliation.affiliation_organization')).alias('inst_name'),
    func.coalesce('sv_country_code','affiliation.affiliation_tag_country').alias('cntry'),
    func.col('afid').alias('affil_id')
  )
  # remove any duplicates left. This may occur in odd cases where the Au_af field is repeated,
  # e.g. eid 84959828211:
  #   #0: {"Authorseq":1,"affiliation_seq":1,"validity_B":true}
  #   #1: {"Authorseq":1,"affiliation_seq":1,"validity_B":true}
  .distinct()
)

tbn_df_ani_auth_affinst="temp_df_ani_auth_affinst"
if tbn_df_ani_auth_affinst not in sqlContext.tableNames(cacheDb):
  print("Generating "+cacheDb+"."+tbn_df_ani_auth_affinst)
  df_ani_auth_affinst_t.repartition(200).write.mode("overwrite").saveAsTable(cacheDb+"."+tbn_df_ani_auth_affinst)
else:
  print("Reusing existing "+cacheDb+"."+tbn_df_ani_auth_affinst)
df_ani_auth_affinst=table(cacheDb+"."+tbn_df_ani_auth_affinst)

# COMMAND ----------

# sanity check: this should return an empty result (no author with more than one affiliation row).
display(
  df_ani_auth_affinst
  .withColumn('reccount',func.count('*').over(Window.partitionBy('auid')))
  .filter('reccount>1')
)

# COMMAND ----------

# complete the dataset: tie it all together and calculate the rank of each author so that we can pull the top 100K.
# ord        order sorted on c
# author_id  Scopus AUID
# authfull   author name
ns_wc = (Window.partitionBy(func.lit(True)).orderBy(func.desc('ns_c')))
ws_wc = (Window.partitionBy(func.lit(True)).orderBy(func.desc('ws_c')))

df_agg_result=(
  df_agg_count_ln_wmaxc
  .filter('auid IS NOT NULL')
  .filter("npY1Y3 >= 2") # only rank authors with >= 2 papers since 1960
  .join(df_ani_auth_name,["auid"],"LEFT_OUTER")
  .join(df_ani_auth_affinst,["auid"],"LEFT_OUTER")
  .withColumnRenamed('auid','author_id')
  .withColumn('ns_ord',func.rank().over(ns_wc))
  .withColumn('ws_ord',func.rank().over(ws_wc))
  .select(
    "author_id",
    "authfull",
    #"inst_id",
    "inst_name",
    "cntry",
    #"affil_id",
    func.col("npY1Y3").alias('np'+Y1+Y3),
    #"firstyr",
    #"lastyr",
    func.col('firstPubYear').alias('firstyr'),
    func.col('lastPubYear').alias('lastyr'),
    func.col("ns_ord").alias('rank (ns)'),
    func.col("ns_ncY2Y3").alias('nc'+Y2+Y3+' (ns)'),
    # not showing citing papers here:
    #func.col("ns_ncY2Y3_cp").alias(''),
    #func.expr("ns_ncY2Y3/ns_ncY2Y3_cp").alias('ns_ncY2Y3_cp_ratio'),
    func.col("ns_hY3").alias('h'+Y3+' (ns)'),
    func.col("ns_hmY3").alias('hm'+Y3+' (ns)'),
    func.col("ns_nps").alias('nps (ns)'),
    func.col("ns_ncs").alias('ncs (ns)'),
    func.col("ns_npsf").alias('cpsf (ns)'),
    func.col("ns_ncsf").alias('ncsf (ns)'),
    func.col("ns_npsfl").alias('npsfl (ns)'),
    func.col("ns_ncsfl").alias('ncsfl (ns)'),
    # no need to store these; the log and normalized-log columns are only there to get to c:
# "ns_lnc", # "ns_lh", # "ns_lhm", # "ns_lns", # "ns_lnsf", # "ns_lnsfl", # "ns_flnc", # "ns_flh", # "ns_flhm", # "ns_flns", # "ns_flnsf", # "ns_flnsfl", func.col("ns_c").alias('c (ns)'), func.col("ns_ncY2Y3_cp").alias('npciting (ns)'), func.expr("ns_ncY2Y3/ns_ncY2Y3_cp").alias('cprat (ns)'), func.expr("CONCAT(100*(ws_ncY2Y3-ns_ncY2Y3)/ws_ncY2Y3,' %')").alias('self%'), func.col("ws_ord").alias('rank'), func.col("ws_ncY2Y3").alias('nc'+Y2+Y3), func.col("ws_hY3").alias('h'+Y3), func.col("ws_hmY3").alias('hm'+Y3), func.col("ws_nps").alias('nps'), func.col("ws_ncs").alias('ncs'), func.col("ws_npsf").alias('cpsf'), func.col("ws_ncsf").alias('ncsf'), func.col("ws_npsfl").alias('npsfl'), func.col("ws_ncsfl").alias('ncsfl'), # no need to store these; the log, and normalized-log are just there to get to c. # "ws_lnc", # "ws_lh", # "ws_lhm", # "ws_lns", # "ws_lnsf", # "ws_lnsfl", # "ws_flnc", # "ws_flh", # "ws_flhm", # "ws_flns", # "ws_flnsf", # "ws_flnsfl", func.col("ws_c").alias('c'), func.col("ws_ncY2Y3_cp").alias('npciting'), func.expr("ws_ncY2Y3/ws_ncY2Y3_cp").alias('cprat'), func.expr('IF(subfields[0] IS NULL,NULL,IF(subfields[0].SMFCMdocCountAuthor<5,NULL,subfields[0].subfieldCode))').alias('sm-1'), func.expr('IF(subfields[0] IS NULL,NULL,IF(subfields[0].SMFCMdocCountAuthor<5,NULL,subfields[0].subfieldName))').alias('name1'), func.expr('IF(subfields[0] IS NULL,NULL,IF(subfields[0].SMFCMdocCountAuthor<5,NULL,subfields[0].subfieldFrac))').alias('frac1'), func.expr('IF(subfields[1] IS NULL,NULL,IF(subfields[1].SMFCMdocCountAuthor<5,NULL,subfields[1].subfieldCode))').alias('sm-2'), func.expr('IF(subfields[1] IS NULL,NULL,IF(subfields[1].SMFCMdocCountAuthor<5,NULL,subfields[1].subfieldName))').alias('name2'), func.expr('IF(subfields[1] IS NULL,NULL,IF(subfields[1].SMFCMdocCountAuthor<5,NULL,subfields[1].subfieldFrac))').alias('frac2'), func.expr('IF(fields[0] IS NULL,NULL,IF(fields[0].SMFCMdocCountAuthor<5,NULL,fields[0].fieldCode))').alias('sm22'), func.expr('IF(fields[0] IS NULL,NULL,IF(fields[0].SMFCMdocCountAuthor<5,NULL,fields[0].fieldName))').alias('name22'), func.expr('IF(fields[0] IS NULL,NULL,IF(fields[0].SMFCMdocCountAuthor<5,NULL,fields[0].fieldFrac))').alias('frac22'), ) ) df_agg_result.repartition(200).write.mode("overwrite").csv('dbfs:'+cachePath+csvOutFileName_s1+'.csv',header = 'true') # COMMAND ---------- df_agg_result_csv=( spark .read .format("csv") .option("header", True) .load(cachPathFull+csvOutFileName_s1+'.csv') .withColumn('rank',func.col('rank').cast('int')) .withColumn('rank (ns)',func.col('rank (ns)').cast('int')) .orderBy(func.col('rank (ns)')) ) # COMMAND ---------- print(Last_ani+" has "+str(df_agg_result_csv.count())+" authors") print(Last_ani+" has "+str(df_agg_result_csv.filter('sm22 IS NOT NULL').count())+" authors with field assigned")# # COMMAND ---------- # table S3 by SM 22 # SM22 SM22-Cat-Name #Auth Cites-25 Cites-50 Cites-75 Cites-90 Cites-95 Cites-99 c-25 c-50 c-75 c-90 c-95 c-99 df_S3_22=( df_agg_result_csv .withColumn('nc'+Y2+Y3,func.col('nc'+Y2+Y3).cast('long')) .withColumn('rec_count',func.count('*').over(Window.partitionBy('sm22'))) .withColumn('nc'+Y2+Y3+'_perc',func.rank().over(Window.partitionBy('sm22').orderBy(func.col('nc'+Y2+Y3).asc()))/func.col('rec_count')) .withColumn('c_perc',func.rank().over(Window.partitionBy('sm22').orderBy(func.asc('c')))/func.col('rec_count')) .groupBy( func.col('sm22').alias('SM22'),func.col('name22').alias('SM22-Cat-Name') ) .agg( func.first('rec_count').alias('#Auth'), func.count(func.expr('IF(`rank 
# COMMAND ----------

# table S3 by SM 22:
# SM22 SM22-Cat-Name #Auth Cites-25 Cites-50 Cites-75 Cites-90 Cites-95 Cites-99 c-25 c-50 c-75 c-90 c-95 c-99
df_S3_22=(
  df_agg_result_csv
  .withColumn('nc'+Y2+Y3,func.col('nc'+Y2+Y3).cast('long'))
  .withColumn('rec_count',func.count('*').over(Window.partitionBy('sm22')))
  .withColumn('nc'+Y2+Y3+'_perc',func.rank().over(Window.partitionBy('sm22').orderBy(func.col('nc'+Y2+Y3).asc()))/func.col('rec_count'))
  .withColumn('c_perc',func.rank().over(Window.partitionBy('sm22').orderBy(func.asc('c')))/func.col('rec_count'))
  .groupBy(
    func.col('sm22').alias('SM22'),func.col('name22').alias('SM22-Cat-Name')
  )
  .agg(
    func.first('rec_count').alias('#Auth'),
    func.count(func.expr('IF(`rank (ns)`<=100000,TRUE,NULL)')).alias('#Auth top 100k (ns)'),
    func.count(func.expr('IF(`rank`<=100000,TRUE,NULL)')).alias('#Auth top 100k'),
    func.min(func.expr('IF(nc'+Y2+Y3+'_perc>=.25,nc'+Y2+Y3+',NULL)')).alias('Cites-25'),
    func.min(func.expr('IF(nc'+Y2+Y3+'_perc>=.50,nc'+Y2+Y3+',NULL)')).alias('Cites-50'),
    func.min(func.expr('IF(nc'+Y2+Y3+'_perc>=.75,nc'+Y2+Y3+',NULL)')).alias('Cites-75'),
    func.min(func.expr('IF(nc'+Y2+Y3+'_perc>=.90,nc'+Y2+Y3+',NULL)')).alias('Cites-90'),
    func.min(func.expr('IF(nc'+Y2+Y3+'_perc>=.95,nc'+Y2+Y3+',NULL)')).alias('Cites-95'),
    func.min(func.expr('IF(nc'+Y2+Y3+'_perc>=.99,nc'+Y2+Y3+',NULL)')).alias('Cites-99'),
    func.min(func.expr('IF(c_perc>=.25,c,NULL)')).alias('c-25'),
    func.min(func.expr('IF(c_perc>=.50,c,NULL)')).alias('c-50'),
    func.min(func.expr('IF(c_perc>=.75,c,NULL)')).alias('c-75'),
    func.min(func.expr('IF(c_perc>=.90,c,NULL)')).alias('c-90'),
    func.min(func.expr('IF(c_perc>=.95,c,NULL)')).alias('c-95'),
    func.min(func.expr('IF(c_perc>=.99,c,NULL)')).alias('c-99'),
  )
  .union(
    df_agg_result_csv
    .withColumn('nc'+Y2+Y3,func.col('nc'+Y2+Y3).cast('long'))
    .withColumn('rec_count',func.count('*').over(Window.partitionBy(func.lit(True))))
    .withColumn('nc'+Y2+Y3+'_perc',func.rank().over(Window.partitionBy(func.lit(True)).orderBy(func.col('nc'+Y2+Y3).asc()))/func.col('rec_count'))
    .withColumn('c_perc',func.rank().over(Window.partitionBy(func.lit(True)).orderBy(func.asc('c')))/func.col('rec_count'))
    .agg(
      func.first(func.lit("")).alias('SM22'),
      func.first(func.lit("TOTAL")).alias('SM22-Cat-Name'),
      func.first('rec_count').alias('#Auth'),
      func.count(func.expr('IF(`rank (ns)`<=100000,TRUE,NULL)')).alias('#Auth top 100k (ns)'),
      func.count(func.expr('IF(`rank`<=100000,TRUE,NULL)')).alias('#Auth top 100k'),
      func.min(func.expr('IF(nc'+Y2+Y3+'_perc>=.25,nc'+Y2+Y3+',NULL)')).alias('Cites-25'),
      func.min(func.expr('IF(nc'+Y2+Y3+'_perc>=.50,nc'+Y2+Y3+',NULL)')).alias('Cites-50'),
      func.min(func.expr('IF(nc'+Y2+Y3+'_perc>=.75,nc'+Y2+Y3+',NULL)')).alias('Cites-75'),
      func.min(func.expr('IF(nc'+Y2+Y3+'_perc>=.90,nc'+Y2+Y3+',NULL)')).alias('Cites-90'),
      func.min(func.expr('IF(nc'+Y2+Y3+'_perc>=.95,nc'+Y2+Y3+',NULL)')).alias('Cites-95'),
      func.min(func.expr('IF(nc'+Y2+Y3+'_perc>=.99,nc'+Y2+Y3+',NULL)')).alias('Cites-99'),
      func.min(func.expr('IF(c_perc>=.25,c,NULL)')).alias('c-25'),
      func.min(func.expr('IF(c_perc>=.50,c,NULL)')).alias('c-50'),
      func.min(func.expr('IF(c_perc>=.75,c,NULL)')).alias('c-75'),
      func.min(func.expr('IF(c_perc>=.90,c,NULL)')).alias('c-90'),
      func.min(func.expr('IF(c_perc>=.95,c,NULL)')).alias('c-95'),
      func.min(func.expr('IF(c_perc>=.99,c,NULL)')).alias('c-99'),
    )
  )
).repartition(1).write.mode("overwrite").csv('dbfs:'+cachePath+csvOutFileName_s3+'_SM22.csv',header='true', quote='"', escape='"')

# COMMAND ----------

# table S3 by SM 176
df_S3_176=(
  df_smc_c.select(func.col('fieldcode').alias('SM22'),func.col('subfieldcode').alias('SM176'))
  .join(
    df_agg_result_csv
    .withColumn('nc'+Y2+Y3,func.col('nc'+Y2+Y3).cast('long'))
    .withColumn('rec_count',func.count('*').over(Window.partitionBy('sm-1')))
    .withColumn('nc'+Y2+Y3+'_perc',func.rank().over(Window.partitionBy('sm-1').orderBy(func.col('nc'+Y2+Y3).asc()))/func.col('rec_count'))
    .withColumn('c_perc',func.rank().over(Window.partitionBy('sm-1').orderBy(func.asc('c')))/func.col('rec_count'))
    .groupBy(
      func.col('sm-1').alias('SM176'),func.col('name1').alias('SM176-Cat-Name')
    )
    .agg(
      func.first('rec_count').alias('#Auth'),
      func.count(func.expr('IF(`rank (ns)`<=100000,TRUE,NULL)')).alias('#Auth top 100k (ns)'),
      func.count(func.expr('IF(`rank`<=100000,TRUE,NULL)')).alias('#Auth top 100k'),
      func.min(func.expr('IF(nc'+Y2+Y3+'_perc>=.25,nc'+Y2+Y3+',NULL)')).alias('Cites-25'),
      func.min(func.expr('IF(nc'+Y2+Y3+'_perc>=.50,nc'+Y2+Y3+',NULL)')).alias('Cites-50'),
      func.min(func.expr('IF(nc'+Y2+Y3+'_perc>=.75,nc'+Y2+Y3+',NULL)')).alias('Cites-75'),
      func.min(func.expr('IF(nc'+Y2+Y3+'_perc>=.90,nc'+Y2+Y3+',NULL)')).alias('Cites-90'),
      func.min(func.expr('IF(nc'+Y2+Y3+'_perc>=.95,nc'+Y2+Y3+',NULL)')).alias('Cites-95'),
      func.min(func.expr('IF(nc'+Y2+Y3+'_perc>=.99,nc'+Y2+Y3+',NULL)')).alias('Cites-99'),
      func.min(func.expr('IF(c_perc>=.25,c,NULL)')).alias('c-25'),
      func.min(func.expr('IF(c_perc>=.50,c,NULL)')).alias('c-50'),
      func.min(func.expr('IF(c_perc>=.75,c,NULL)')).alias('c-75'),
      func.min(func.expr('IF(c_perc>=.90,c,NULL)')).alias('c-90'),
      func.min(func.expr('IF(c_perc>=.95,c,NULL)')).alias('c-95'),
      func.min(func.expr('IF(c_perc>=.99,c,NULL)')).alias('c-99'),
    ),['SM176']
  )
).repartition(1).write.mode("overwrite").csv('dbfs:'+cachePath+csvOutFileName_s3+'_SM176.csv',header='true', quote='"', escape='"')

# COMMAND ----------

# export the top-100K slice (by either rank) as a single CSV:
(
  df_agg_result_csv
  .filter('rank <= 100000 OR `rank (ns)` <= 100000')
  .repartition(1)
  .write
  .mode("overwrite")
  .csv('dbfs:'+cachePath+csvOutFileName_s1+'_100K.csv',header='true', quote='"', escape='"')
)
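
# COMMAND ----------

# Illustrative only (hypothetical EIDs/AUIDs, not part of the data flow): the
# self-citation test used in the citation-counts cell near the top marks a
# citing/cited pair as a self-citation when the two author-ID arrays overlap:
toy_cites = spark.createDataFrame(
    [('e1', 'e9', [1, 2], [2, 3]),   # author 2 appears on both sides -> self-citation
     ('e2', 'e9', [4],    [2, 3])],  # no overlap -> independent citation
    ['CitingEid', 'Eid', 'CitingAuids', 'CitedAuids'])
(toy_cites
 .withColumn('overLappingAuthors',
             func.size(func.array_intersect('CitingAuids', 'CitedAuids')))
 .withColumn('isSelfCitation', func.expr('IF(overLappingAuthors>0,1,0)'))
 .show())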