2023 VIS Area Curation Committee Executive Summary

Summary

This report summarizes the findings, recommendations, and process by the VIS Area Curation Committee (ACC) regarding the areas and keywords used for paper submissions to IEEE VIS 2022. It is based on the 2021 ACC committee report. According to the Charter, the goal of this committee is to analyze and report how submissions made use of the areas and keywords to describe their contribution. It is important to understand when these descriptors no longer adequately cover the breadth of research presented at VIS.

We use submission and bidding information from VIS 2022 to analyze the impact of moving to an area model.

Given the information we have access to, the move appears to be broadly successful, and we do not make recommendations for any changes at this time.

Our analysis suggests that submissions are relatively balanced across areas, keywords are (with a small exception) well distributed, and the unified PC appears to provide broad and overlapping coverage.

The full data and source code to rebuild this project are available here.

Committee members 2022: Steven Drucker (chair), Ingrid Hotz, David Laidlaw, Heike Leitte, Torsten Möller, Carlos Scheidegger, Hendrik Strobelt, Shigeo Takahashi, Penny Rheingans.

Committee members 2021: Alex Endert (chair), Steven Drucker (next chair), Issei Fujishiro, Christoph Garth, Heidi Lam, Heike Leitte, Carlos Scheidegger, Hendrik Strobelt, Penny Rheingans.

Last edited: 2022-09-29.

Code

import itertools

import pandas as pd
import numpy as np

# Import the necessary libraries
import plotly.offline as pio
import plotly.graph_objs as go
import plotly.express as px
#pio.renderers.default = "notebook_connected"
# Set notebook mode to work in offline
pio.init_notebook_mode()
width = 750

import sqlite3

#### Data Preparation

# Static lookup tables that translate database codes into readable names.

# Decision codes as they appear in the data. The "should be N in 2022"
# remarks are the expected counts used as sanity checks for the 2022 data.
_decision_labels = {
    'C': 'Confer vs. cond Accept',  # has a different meaning in the 2020 and 2021 data
    'A': 'Accept',  # for the 2020 data
    'A2': 'Accept',  # after the second round, should be 120 in 2022
    'R': 'Reject',  # reject after the first round -- should be 322 in 2022
    'R2': 'Reject in round 2',  # reject after the second round -- should be 2 in 2022
    'R-2nd': 'Reject in round 2',
    'DR-S': 'Desk Reject (Scope)',  # should be 7 in 2022
    'DR-P': 'Desk Reject (Plagiarism)',  # should be 4 in 2022
    'AR-P': 'Admin Reject (Plagiarism)',  # should be 1 in 2022
    'DR-F': 'Desk Reject (Format)',  # should be 4 in 2022
}

# Codes that count as an acceptance when decisions are flattened.
_accept_codes = ('C', 'A', 'A2')

staticdata = {
    'decision': _decision_labels,
    # Same codes in the same order, flattened to just Accept / Reject.
    'FinalDecision': {
        code: ('Accept' if code in _accept_codes else 'Reject')
        for code in _decision_labels
    },
    'area': {
        'T&E': 'Theoretical & Empirical',
        'App': 'Applications',
        'S&R': 'Systems & Rendering',
        'R&I': 'Representations & Interaction',
        'DTr': 'Data Transformations',
        'A&D': 'Analytics & Decisions',
    },
    'bid': {
        0: 'no bid',
        1: 'want',
        2: 'willing',
        3: 'reluctant',
        4: 'conflict',
    },
    'stat': {
        'Prim': 'Primary',
        'Seco': 'Secondary',
    },
    # The 2021 file is correct here: there was no new keywords file in 2022.
    'keywords': pd.read_csv("./data/2021/keywords.csv", sep=';'),
    # Raw column names -> column names used throughout the report.
    'colnames': {
        'confsubid': 'Paper ID',
        'rid': 'Reviewer',
        'decision': 'Decision',
        'area': 'Area',
        'stat': 'Role',
        'bid': 'Bid',
    },
}

# Open the SQLite database that holds the submission and bidding tables.
dbcon = sqlite3.connect('./data/vis-area-chair.db')

# Per-year slices of the 'submissions' table, each indexed by 'sid'.
submissions_raw20 = pd.read_sql_query('SELECT * from submissions WHERE year = 2020', dbcon, 'sid')
submissions_raw21 = pd.read_sql_query('SELECT * from submissions WHERE year = 2021', dbcon, 'sid')
submissions_raw22 = pd.read_sql_query('SELECT * from submissions WHERE year = 2022', dbcon, 'sid')
submissions_raw23 = pd.read_sql_query('SELECT * from submissions WHERE year = 2023', dbcon, 'sid')
# The complete 'submissions' table (all years), indexed by 'sid'.
submissions_raw = pd.read_sql_query('SELECT * from submissions', dbcon, 'sid')
#print(submissions_raw23)

# Build the main submissions table used throughout the report.
submissions = (submissions_raw
    # attach the columns of the 'areas' table via the 'aid' foreign key
    .join(
        pd.read_sql_query('SELECT * from areas', dbcon, 'aid'), 
        on='aid'
    )
    # Keywords: per submission ('sid'), the list of its keyword strings.
    # NOTE(review): .loc[df.index] raises KeyError if a submission has no row
    # in 'submissionkeywords' -- presumably every submission has at least one
    # keyword; confirm against the database.
    .assign(Keywords = lambda df: (pd
        .read_sql_query('SELECT * FROM submissionkeywords', dbcon, 'sid')
        .loc[df.index]
        .join(
            pd.read_sql_query('SELECT * FROM keywords', dbcon, 'kid'), 
            on='kid'
        )
        .keyword
        .groupby('sid')
            .apply(list)
    ))
    # number of keywords attached to each submission
    .assign(**{'# Keywords': lambda df: df.Keywords.apply(len)})
    # copy of the raw decision code; the replace() below maps it through
    # staticdata['FinalDecision'] (flattened Accept/Reject), while 'decision'
    # is mapped through staticdata['decision']
    .assign(**{'FinalDecision': lambda df: df['decision']})
    # translate codes to readable names, column by column, using the nested
    # dicts in staticdata
    .replace(staticdata)
    .rename(columns = staticdata['colnames'])
    .drop(columns = ['legacy', 'aid'])
#    .set_index('sid')
#    .set_index('Paper ID')
# Note: the index deliberately stays 'sid', because 'Paper ID' is not unique
# across multiple years. 'sid' is used as a unique index in the creation of
# the database anyway.
)

# Replace the old 'Paper ID' with a unique identifier (the 'sid'), so that the
# code from 2021 will work. The original value is kept as 'Old Paper ID'.
submissions = submissions.rename(columns = {'Paper ID':'Old Paper ID'})
submissions.reset_index(inplace=True)
submissions['Paper ID'] = submissions['sid']
submissions = submissions.set_index('Paper ID')
# submissions columns: (index) Paper ID (unique, equal to sid), sid (unique id), Old Paper ID, Decision, year, Area, Keywords (as a list), # Keywords, FinalDecision

# Every distinct submission year present in the data.
all_years = submissions['year'].unique()

# rates_decision: one row per (Decision, year) with the number of submissions
# ('count') and the share of that year's submissions in percent ('Percentage').
rates_decision = submissions.value_counts(['Decision', 'year']).reset_index()
rates_decision = rates_decision.rename(columns={0: 'count'})
rates_decision['Percentage'] = (
    rates_decision['count']
    / rates_decision.groupby(['year'])['count'].transform('sum')
    * 100
)
rates_decision = rates_decision.round({'Percentage': 1})

# rates_decision_final: the same table for the flattened Accept/Reject
# decision, i.e. the per-year acceptance rate.
rates_decision_final = submissions.value_counts(['FinalDecision', 'year']).reset_index()
rates_decision_final = rates_decision_final.rename(columns={0: 'count'})
rates_decision_final['Percentage'] = (
    rates_decision_final['count']
    / rates_decision_final.groupby(['year'])['count'].transform('sum')
    * 100
)
rates_decision_final = rates_decision_final.round({'Percentage': 1})
#submissions
# bids_raw: one row per entry of the 'reviewerbids' table, with the conference
# submission id merged in via 'sid' and all codes replaced by readable names.
# Columns: Reviewer, sid (unique paper identifier over multiple years),
# match (match score), Bid, Role, Paper ID.
bids_raw = (pd
    .read_sql_query('SELECT * from reviewerbids', dbcon)
    .merge(submissions_raw['confsubid'], on='sid')
    .replace(staticdata)
    .rename(columns = staticdata['colnames'])
)

# The conference 'Paper ID' is not unique over multiple years: keep it as
# 'Old Paper ID' and use the unique 'sid' as 'Paper ID' (all three are kept).
bids_raw = bids_raw.rename(columns = {'Paper ID':'Old Paper ID'})
bids_raw['Paper ID'] = bids_raw['sid']

# bids: Reviewer, sid, Paper ID, Bid -- only the pairs for which the reviewer
# actually placed a bid.
bids = (bids_raw
    .query('Bid != "no bid"')
    [['Reviewer','sid', 'Paper ID', 'Bid']]
    .reset_index(drop = True)
)

# matchscores: reviewer x paper table of match scores.
# Many cells will be NaN since several years are combined here.
# TODO: check whether the reviewer IDs remain unique across the years!
matchscores = (bids_raw
    [['Reviewer','sid','Paper ID','match']]
    .set_index(['Reviewer', 'Paper ID'])
    .match
    .unstack(level=1)
)

# assignments: Reviewer, sid, Paper ID, Role (Primary / Secondary) -- only the
# pairs where a role was assigned.
assignments = (bids_raw
    .query('Role != ""')
    [['Reviewer', 'sid', 'Paper ID', 'Role']]
    .reset_index(drop = True)
)

# All tables are loaded: close the connection explicitly instead of relying on
# garbage collection, then drop the name as before.
dbcon.close()
del dbcon

#### Plot Defaults

def _axis_defaults():
    """Return a fresh dict with the styling shared by the x and y axes."""
    return dict(
        title = dict(
            font = dict(family='Fira Sans Medium', size=13),
            standoff = 10,
        ),
        gridcolor = 'lightgray',
        gridwidth = 1,
        automargin = True,
        fixedrange = True,
    )


# Plotly template applied to every figure of the report.
acc_template = go.layout.Template()

acc_template.layout = dict(
    font = dict(family='Fira Sans', color='black', size=13),
    title_font_size = 14,
    plot_bgcolor = 'rgba(255,255,255,0)',
    paper_bgcolor = 'rgba(255,255,255,0)',
    margin = dict(pad=10),
    xaxis = _axis_defaults(),
    yaxis = _axis_defaults(),
    legend = dict(title_font_family="Fira Sans Medium"),
    colorway = px.colors.qualitative.T10,
    hovermode = 'closest',
    hoverlabel = dict(
        bgcolor = "white",
        bordercolor = 'lightgray',
        font_color = 'black',
        font_family = 'Fira Sans',
    ),
)

# Default appearance of the text drawn on bar traces.
acc_template.data.bar = [
    dict(textposition='inside', insidetextanchor='middle', textfont_size=12),
]

px.defaults.template = acc_template

# Fixed category orders so that colors and axis orders agree between plots.
px.defaults.category_orders = {
    'Decision': list(staticdata['decision'].values()),
    'FinalDecision':  list(staticdata['FinalDecision'].values()),
    'Area': list(staticdata['area'].values()),
    'Short Name': staticdata['keywords']['Short Name'].tolist(),
}

# Options handed to every fig.show(config=...) call.
config = dict(displayModeBar=False, scrollZoom=False, responsive=False)

def aspect(ratio):
    """Return ``width``/``height`` keyword arguments for a figure layout.

    The width is the module-level ``width``; the height is ``ratio`` times
    that width, truncated to an integer.
    """
    height = int(ratio * width)
    return dict(width=width, height=height)

# useful data sub-products

# k_all: one row per (submission, keyword) pair, enriched with the keyword
# metadata from staticdata['keywords'].
# Columns: Paper ID, sid, Old Paper ID, Decision, year, Area, Keywords (as a
# list), # Keywords, Keyword, Category, Subcategory, Short Name, Description
_keyword_per_row = submissions['Keywords'].explode().rename('Keyword')
k_all = submissions.join(_keyword_per_row)
k_all = k_all.reset_index(level=0)
k_all = k_all.merge(staticdata['keywords'], on='Keyword')

# k_total: number of submissions per keyword and year ('# Submissions'),
# together with the keyword metadata.
# Columns: Category, Subcategory, Short Name, Keyword, Description, # Submissions, year
_submissions_per_keyword = (
    k_all.value_counts(['Short Name', 'year'])
    .rename('# Submissions')
    .reset_index()
)
k_total = staticdata['keywords'].merge(_submissions_per_keyword, how='right')

# k_cnt: the same per-keyword, per-year counts, stored in column 'c' and left
# in group order (sort=False) rather than sorted by frequency.
# Columns: Short Name, year, c, Category, Subcategory, Keyword, Description
k_cnt = (
    k_all.value_counts(['Short Name', 'year'], sort=False)
    .rename('c')
    .to_frame()
    .reset_index()
)
k_cnt = k_cnt.merge(staticdata['keywords'], on='Short Name')

Highlights

Some highlights of the data to support our current recommendations:

Acceptance rates have stayed approximately the same from 2020 to 2022 (26.8%, 24.9%, and 26.1%), though there was a significant drop (24.4%) in the number of submissions between 2020 and 2021 (pandemic related?).

Code

# Stacked horizontal bars of the flattened Accept/Reject decisions per year:
# first as percentages, then as absolute counts.
# Each entry: (value column, x-axis title, custom_data columns, hover text).
_decision_bar_specs = [
    (
        'Percentage',
        'Percentage of Submissions',
        ['FinalDecision', 'count'],
        '%{customdata[1]} submissions in %{y} have decision %{customdata[0]}<extra></extra>',
    ),
    (
        'count',
        'Number of Submissions',
        ['FinalDecision'],
        '%{x} submissions in %{y} have decision %{customdata[0]}<extra></extra>',
    ),
]

for _value_col, _axis_title, _custom_cols, _hover in _decision_bar_specs:
    # Keep the Figure in `fig`; chaining .show() onto the expression would
    # bind `fig` to None, because show() returns nothing.
    fig = px.bar(rates_decision_final,
        x = _value_col,
        y = 'year',
        barmode = 'stack',
        orientation = 'h',
        color = 'FinalDecision',
        text = _value_col,
        custom_data = _custom_cols,
    )
    fig.update_layout(
        title = 'Submissions',
        xaxis_title = _axis_title,
        **aspect(0.35)
    )
    fig.update_traces(hovertemplate = _hover)
    fig.show(config=config)

Submissions across the (reformulated) areas are stable between 2021 and 2022

Code

# Number of submissions per (Area, FinalDecision, year).
tmp = submissions.value_counts(['Area', 'FinalDecision', 'year']).reset_index()
tmp = tmp.rename(columns={0: 'count'})

# One bar trace per year, each in its own offset group so that the years are
# drawn side by side within an area.
recent_years = [2021, 2022, 2023]
data = []
for count, my_year in enumerate(recent_years, start=1):
    rows_of_year = tmp[tmp['year'] == my_year]
    data.append(go.Bar(
        x=rows_of_year["Area"],
        y=rows_of_year['count'],
        customdata=rows_of_year['FinalDecision'],
        hovertemplate="%{y} papers were %{customdata} in",
        name=f"{my_year}",
        offsetgroup=count,
    ))

fig2 = go.Figure(
    data=data,
    layout=go.Layout(
        title="Comparing # submissions 2021, 2022 and 2023",
        xaxis_title="Areas",
    ),
)
fig2.show()

And frequencies of the use of keywords range from 5 to 120. The keywords with the highest number of occurrences are not very useful for categorizing papers, but they are very meaningful, and differentiation works effectively with accompanying keywords. We believe that having five papers that use a keyword is sufficient to warrant retaining it.

Code

# Bar chart of how many submissions specified each keyword, with one facet
# row per year (newest year first).
_keyword_hist = px.bar(
    k_total,
    x='Short Name',
    y='# Submissions',
    facet_row='year',
    category_orders={'year': [2023, 2022, 2021, 2020]},
)
_keyword_hist.update_traces(
    hovertemplate="'%{x}' specified in %{y} submissions<extra></extra>",
)
_keyword_hist.update_layout(
    xaxis_tickfont_size=8,
    xaxis_dtick=1,
    yaxis_dtick=50,
    hovermode='closest',
    title='Frequency of keywords across submissions',
    **aspect(0.8)
)
_keyword_hist.show(config=config)

Acceptance rates have been rather equal across areas in 2021, but not so in 2022. “Representation and Interaction” lost the most, while “Data Transformations” and “Theoretical and Empirical” gained the most. Perhaps this is worth watching …

Code

# Acceptance percentage per area and year, excluding 2020.
recent_submissions = submissions[submissions['year'] != 2020]

# Name the count columns explicitly: value_counts() names its result 'count'
# on pandas >= 2.0 and leaves it unnamed (column 0) on older versions, so
# renaming column 0 afterwards is not reliable.
tmptotal = (recent_submissions
    .value_counts(['Area', 'year'])
    .reset_index(name = 'total')
)
tmp = (recent_submissions
    .value_counts(['Area', 'FinalDecision', 'year'])
    .reset_index(name = 'count')
)
tmpfinal = pd.merge(left=tmp, right=tmptotal, on=['Area','year'])
# share of the area's submissions of that year, in percent with one decimal
tmpfinal['percentage']= round(tmpfinal['count']/tmpfinal['total'] *1000)/10.0

fig = px.bar(tmpfinal,
    x = 'year',
    y = 'percentage',
    barmode = 'stack',
    orientation = 'v',
    color = 'FinalDecision',
    text = 'percentage',
    custom_data = ['FinalDecision'],
    facet_col='Area',
    category_orders = {"year": [2021,2022, 2023]},
    facet_col_spacing=0.06, # default is 0.03
)
fig.update_layout(
    title = 'Submissions by area and year',
    xaxis_title = 'year',
    legend=dict(
        yanchor="top",
        y=1,  # Adjust legends y-position
        xanchor="left",
        x=1.08,  # ... and x-position to avoid overlapping
    ),
    **aspect(0.8)
)
fig.update_xaxes(type='category')
fig.update_traces(
    # bars are vertical here: x is the year, y the percentage
    hovertemplate = '%{y}% of %{x} submissions have decision %{customdata[0]}<extra></extra>',
)
# facet titles read "Area=<name>"; keep only the name
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
# lower every second facet title so neighbouring titles do not overlap
for i,a in enumerate(fig.layout.annotations):
    if (i%2):
        a.update(yshift=-15)
fig.show(config=config)

There seems to be a trend toward more “Theoretical & Empirical” as well as “Analytics & Decisions” submissions. However, we only have two years of data, so this is worth continuing to observe.

Code

# Number of submissions per area and year, excluding 2020.
recent_submissions = submissions[submissions['year'] != 2020]

# Name the count columns explicitly: value_counts() names its result 'count'
# on pandas >= 2.0 and leaves it unnamed (column 0) on older versions, so
# renaming column 0 afterwards is not reliable.
tmptotal = (recent_submissions
    .value_counts(['Area', 'year'])
    .reset_index(name = 'total')
)
tmp = (recent_submissions
    .value_counts(['Area', 'FinalDecision', 'year'])
    .reset_index(name = 'count')
)
tmpfinal = pd.merge(left=tmp, right=tmptotal, on=['Area','year'])
tmpfinal['percentage']= round(tmpfinal['count']/tmpfinal['total'] *1000)/10.0
# categorical copy of the year so that each year gets a discrete color
tmpfinal['yearcat'] = tmpfinal['year'].astype('category')

fig = px.bar(tmpfinal,
    x = 'year',
    y = 'count',
    barmode = 'stack',
    orientation = 'v',
    color = 'yearcat',
    text = 'count',
    custom_data = ['FinalDecision'],
    facet_col='Area',
    category_orders = {"year": [2021,2022, 2023]},
    facet_col_spacing=0.06, # default is 0.03
)
fig.update_layout(
    title = 'Submissions by area and year',
    xaxis_title = 'year',
    **aspect(0.8)
)
fig.update_xaxes(type='category')
fig.update_traces(
    # bars are vertical here: x is the year, y the number of submissions
    hovertemplate = '%{y} submissions in %{x} have decision %{customdata[0]}<extra></extra>',
)
# facet titles read "Area=<name>"; keep only the name
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
# lower every second facet title so neighbouring titles do not overlap
for i,a in enumerate(fig.layout.annotations):
    if (i%2):
        a.update(yshift=-15)
fig.show(config=config)

Deeper data investigation

This report is generated by members of the ACC for the current year, and prepared for the VSC. Upon review, it will be linked from the IEEE VIS website. The conclusions and discussion points are based on submission and reviewer data from IEEE VIS 2022 (and previous years). The report and analysis performed is focused on the use of keywords, areas, and reviewer matching. Thus, there are likely other aspects of conference organization which are not covered (but could be considered).

The report is broken down into the following sections. After the summary at the beginning, the data and analysis process is described. It shows which data we used, where it is stored, and how it is obtained. These processes can be adapted for future years of this committee.

(NB: Some of the plots shown above are repeated here from the highlights for the sake of completeness.)

Data and Process

We analyzed anonymized data containing information about the full paper submissions to VIS 2022, the reviews of these submissions, and the IPC bidding preferences. We analyzed this data to understand how well the areas and keywords characterize the body of work submitted this year. We also analyzed the IPC bidding information to understand how well the expertise of the IPC members covers the submissions. Below, we show highlights of our findings.

Note that in the analysis that follows, the submission/paper IDs and reviewer IDs are anonymized through a randomizer, and are not the IDs used in PCS for submissions and reviewers.

The data used to perform this analysis is a combination of paper submission data and reviewer bidding data. Both sets were anonymized to minimize the ability to identify IPC members, authors, or reviewers. The analysis of the data this year uses the anonymized CSV files obtained directly from PCS. You can see the source code used to process and generate the plots in this document by clicking on the “Code” buttons, which will fold out the Python code used. The anonymization script that was used is located in the anonymization-scripts folder (and may need to be updated to correspond with changes made in PCS). In order to get ALL the data, it is currently run by James at PCS, who sends the resultant anonymized files to the committee, where they are stored in the corresponding year folder.

In order to facilitate longitudinal studies of this data, we are also providing a sqlite database with the 2020, 2021, and 2022 data in an attempt to make it easier to incorporate future years. This database (as well as the source code of this document) can be found here

Sanity Checks

We include some sanity checks on the data in order to make sure the data has been processed correctly. In 2022, we should have:

120 papers accepted after the second round
322 papers rejected after the first round
2 papers rejected after the second round
1 paper administrative reject because of a plagiarism problem
4 papers desk rejected because of a format issue
4 papers desk rejected because of a plagiarism problem
7 papers desk rejected because of a scope problem

Code

# rates_decision: one row per (Decision, year) with the number of submissions
# ('count') and the share of that year's submissions in percent ('Percentage').
rates_decision = submissions.value_counts(['Decision', 'year']).reset_index()
rates_decision = rates_decision.rename(columns={0: 'count'})
rates_decision['Percentage'] = (
    rates_decision['count']
    / rates_decision.groupby(['year'])['count'].transform('sum')
    * 100
)
rates_decision = rates_decision.round({'Percentage': 1})

# rates_decision_final: the same table for the flattened Accept/Reject
# decision, i.e. the per-year acceptance rate.
rates_decision_final = submissions.value_counts(['FinalDecision', 'year']).reset_index()
rates_decision_final = rates_decision_final.rename(columns={0: 'count'})
rates_decision_final['Percentage'] = (
    rates_decision_final['count']
    / rates_decision_final.groupby(['year'])['count'].transform('sum')
    * 100
)
rates_decision_final = rates_decision_final.round({'Percentage': 1})

Code

# Display the flattened accept/reject table: newest year first, and within a
# year ordered by FinalDecision ascending (Accept before Reject).
rates_decision_final.sort_values(by=['year', 'FinalDecision'], ascending=[False, True], ignore_index=True)

	FinalDecision	year	count	Percentage
0	Accept	2023	139	25.8
1	Reject	2023	400	74.2
2	Accept	2022	120	26.1
3	Reject	2022	340	73.9
4	Accept	2021	110	24.9
5	Reject	2021	332	75.1
6	Accept	2020	157	26.8
7	Reject	2020	428	73.2

Code

# Stacked horizontal bars of the detailed decisions per year: first as
# absolute counts, then as percentages.
# Each entry: (value column, x-axis title, custom_data columns, hover text).
_detailed_bar_specs = [
    (
        'count',
        'Number of Submissions',
        ['Decision'],
        '%{x} submissions in %{y} have decision %{customdata[0]}<extra></extra>',
    ),
    (
        'Percentage',
        'Percentage of Submissions',
        ['Decision', 'count'],
        '%{customdata[1]} submissions in %{y} have decision %{customdata[0]}<extra></extra>',
    ),
]

for _value_col, _axis_title, _custom_cols, _hover in _detailed_bar_specs:
    # Keep the Figure in `fig`; chaining .show() onto the expression would
    # bind `fig` to None, because show() returns nothing.
    fig = px.bar(rates_decision,
        x = _value_col,
        y = 'year',
        barmode = 'stack',
        orientation = 'h',
        color = 'Decision',
        text = _value_col,
        custom_data = _custom_cols,
    )
    fig.update_layout(
        title = 'Submissions',
        xaxis_title = _axis_title,
        **aspect(0.45)
    )
    fig.update_traces(hovertemplate = _hover)
    fig.show(config=config)

The wide ranges of decisions can be collapsed into more straightforward Accept or Reject (where Reject includes desk rejects, admin rejects, and rejections in round 1 or 2)

Code

# Stacked horizontal bars of the flattened Accept/Reject decisions per year:
# first as absolute counts, then as percentages.
# Each entry: (value column, x-axis title, custom_data columns, hover text,
# height/width ratio).
_flattened_bar_specs = [
    (
        'count',
        'Number of Submissions',
        ['FinalDecision'],
        '%{x} submissions in %{y} have decision %{customdata[0]}<extra></extra>',
        0.45,
    ),
    (
        'Percentage',
        'Percentage of Submissions',
        ['FinalDecision', 'count'],
        '%{customdata[1]} submissions in %{y} have decision %{customdata[0]}<extra></extra>',
        0.35,
    ),
]

for _value_col, _axis_title, _custom_cols, _hover, _ratio in _flattened_bar_specs:
    # Keep the Figure in `fig`; chaining .show() onto the expression would
    # bind `fig` to None, because show() returns nothing.
    fig = px.bar(rates_decision_final,
        x = _value_col,
        y = 'year',
        barmode = 'stack',
        orientation = 'h',
        color = 'FinalDecision',
        text = _value_col,
        custom_data = _custom_cols,
    )
    fig.update_layout(
        title = 'Submissions',
        xaxis_title = _axis_title,
        **aspect(_ratio)
    )
    fig.update_traces(hovertemplate = _hover)
    fig.show(config=config)

Submissions per Area.

We wanted to understand how submissions were distributed by area, including acceptance decisions. Submissions to each area were within reasonable upper and lower limits, and decisions did not appear partial to any individual area.

Code


def group_stat(g):
    """Summarize the counts of one group.

    Returns a DataFrame indexed like ``g`` with the counts themselves
    ('# Submissions'), each count's share of the group total in percent,
    rounded to one decimal ('% Submissions'), and the group total repeated
    on every row ('Total').
    """
    total = g.sum()
    share = (g / total * 100).round(1)
    return pd.DataFrame({
        '# Submissions': g,
        '% Submissions': share,
        'Total': total,
    })

# Number of submissions per (Area, Decision, year).
tmp = (submissions
    .value_counts(['Area', 'Decision', 'year'])
    .reset_index()
    .rename(columns = {0: 'count'})
)

# Figures are kept in `fig` and shown afterwards; chaining .show() onto the
# expression would bind `fig` to None, because show() returns nothing.

# Stacked decisions per area, one facet row per year (newest first).
fig = px.bar(tmp,
    x = 'count',
    y = 'Area',
    barmode = 'stack',
    orientation = 'h',
    color = 'Decision',
    text = 'count',
    custom_data = ['Decision'],
    facet_row='year',
    category_orders={'year': [2023,2022, 2021, 2020]},
)
fig.update_layout(
    title = 'Submissions by area and year',
    xaxis_title = 'Number of Submissions',
    yaxis=dict(
        tickfont=dict(size=12),  # Adjust y-label fontsize
    ),
    **aspect(1.3)
)
fig.update_traces(
    hovertemplate = '%{x} submissions in %{y} have decision %{customdata[0]}<extra></extra>',
    texttemplate='%{text}',
    textangle=0  # Force labels to have horizontal orientation
)
fig.show(config=config)

# Stacked decisions per area with all years drawn together.
fig = px.bar(tmp,
    x = 'count',
    y = 'Area',
    barmode = 'stack',
    orientation = 'h',
    color = 'Decision',
    text = 'count',
    custom_data = ['Decision'],
)
fig.update_layout(
    title = 'Submissions by area all years',
    xaxis_title = 'Number of Submissions all years',
    yaxis=dict(
        tickfont=dict(size=12),  # Adjust y-label fontsize
    ),
    **aspect(0.5)
)
fig.update_traces(
    hovertemplate = '%{x} submissions in %{y} have decision %{customdata[0]}<extra></extra>',
)
fig.show(config=config)

# One bar trace per year, each in its own offset group.
data = []
for count, my_year in enumerate(all_years, start=1):
    rows_of_year = tmp[tmp['year'] == my_year]
    data.append(go.Bar(
        x=rows_of_year["Area"],
        y=rows_of_year['count'],
        customdata=rows_of_year['Decision'],
        # customdata is one string per bar here, so it is referenced as a
        # whole; indexing it with [0] would pick a single character
        hovertemplate="%{y} papers were %{customdata} in",
        name=f"{my_year}",
        offsetgroup=count,
    ))

fig2 = go.Figure(
    data=data,
    layout=go.Layout(
        # list the years that are actually plotted (all_years)
        title="Comparing # submissions " + ", ".join(str(y) for y in sorted(all_years)),
        xaxis_title="Areas"
    )
)
fig2.show()

Code

# Accept/Reject percentages per area, excluding 2020.
recent_submissions = submissions[submissions['year'] != 2020]

# Name the count columns explicitly: value_counts() names its result 'count'
# on pandas >= 2.0 and leaves it unnamed (column 0) on older versions, so
# renaming column 0 afterwards is not reliable.
tmptotal = (recent_submissions
    .value_counts(['Area', 'year'])
    .reset_index(name = 'total')
)
tmp = (recent_submissions
    .value_counts(['Area', 'FinalDecision', 'year'])
    .reset_index(name = 'count')
)
tmpfinal = pd.merge(left=tmp, right=tmptotal, on=['Area','year'])
# share of the area's submissions of that year, in percent with one decimal
tmpfinal['percentage']= round(tmpfinal['count']/tmpfinal['total'] *1000)/10.0

# Percentages per area, one facet row per year.
fig = px.bar(tmpfinal,
    x = 'percentage',
    y = 'Area',
    barmode = 'stack',
    orientation = 'h',
    color = 'FinalDecision',
    text = 'percentage',
    custom_data = ['FinalDecision'],
    facet_row='year',
)
fig.update_layout(
    title = 'Submissions by area and year',
    xaxis_title = 'Percentage of Submissions',
    **aspect(0.8)
)
fig.update_traces(
    hovertemplate = '%{x}% of submissions in %{y} have decision %{customdata[0]}<extra></extra>',
    texttemplate='%{text}',
    textangle=0  # Force labels to have horizontal orientation
)
fig.show(config=config)

# Aggregate over all recent years: decision counts per (Area, FinalDecision)
# and total submissions per Area. The percentage is computed from these
# aggregated columns; the per-year rows of tmpfinal have a different index
# and would be misaligned.
_area_totals = tmptotal.groupby('Area', as_index=False)['total'].sum()
tmpfinal2 = (tmp
    .groupby(['Area', 'FinalDecision'], as_index=False)['count'].sum()
    .merge(_area_totals, on='Area')
)
tmpfinal2['newpercentage'] = round(tmpfinal2['count']/tmpfinal2['total']*1000)/10

fig = px.bar(tmpfinal2,
    x = 'newpercentage',
    y = 'Area',
    barmode = 'stack',
    orientation = 'h',
    color = 'FinalDecision',
    text = 'newpercentage',
    custom_data = ['FinalDecision'],
)
fig.update_layout(
    title = 'Submissions by area for all recent years',
    xaxis_title = 'Percentage of Submissions for all recent years',
    **aspect(0.35)
)
fig.update_traces(
    hovertemplate = '%{x}% of submissions in %{y} have decision %{customdata[0]}<extra></extra>',
    texttemplate='%{text}',
    textangle=0  # Force labels to have horizontal orientation
)
fig.show(config=config)

# One bar trace per recent year, each in its own offset group.
data = []
recent_years = all_years[all_years != 2020]
for count, my_year in enumerate(recent_years, start=1):
    rows_of_year = tmp[tmp['year'] == my_year]
    data.append(go.Bar(
        x=rows_of_year["Area"],
        y=rows_of_year['count'],
        customdata=rows_of_year['FinalDecision'],
        hovertemplate="%{y} papers were %{customdata} in",
        name=f"{my_year}",
        offsetgroup=count,
    ))

fig2 = go.Figure(
    data=data,
    layout=go.Layout(
        title="Comparing # submissions 2021, 2022 and 2023",
        xaxis_title="Areas"
    )
)
fig2.show()

Submissions and Keywords used

We also analyzed how often keywords were used in the submissions. The frequency of keywords used is reasonable. The one exception which should be watched for next year is “Application-Motivated Visualization”, which may require further specification or description.

How many papers were submitted to each area, and what is the breakdown of decisions?

Code

# Bar chart of how many submissions specified each keyword, with one facet
# row per year (newest year first).
_keyword_hist = px.bar(
    k_total,
    x='Short Name',
    y='# Submissions',
    facet_row='year',
    category_orders={'year': [2023, 2022, 2021, 2020]},
)
_keyword_hist.update_traces(
    hovertemplate="'%{x}' specified in %{y} submissions<extra></extra>",
)
_keyword_hist.update_layout(
    xaxis_tickfont_size=8,
    xaxis_dtick=1,
    yaxis_dtick=50,
    hovermode='closest',
    title='Frequency of keywords across submissions',
    **aspect(0.8)
)
_keyword_hist.show(config=config)

Code

# Bar chart of how many submissions specified each keyword, colored by the
# keyword category, with one facet row per year (oldest year first).
_keyword_hist_by_category = px.bar(
    k_total,
    x='Short Name',
    y='# Submissions',
    color='Category',
    facet_row='year',
    category_orders={'year': [2020, 2021, 2022, 2023]},
)
_keyword_hist_by_category.update_traces(
    hovertemplate="'%{x}' specified in %{y} submissions<extra></extra>",
)
_keyword_hist_by_category.update_layout(
    xaxis_tickfont_size=8,
    xaxis_dtick=1,
    yaxis_dtick=50,
    hovermode='closest',
    title='Frequency of keywords across submissions',
    **aspect(0.8)
)
_keyword_hist_by_category.show(config=config)

Code

k_cnt = staticdata['keywords'].merge(
    pd.DataFrame(staticdata['area'].values(), columns = ['Area']), 
    how = 'cross'
).merge(
    k_all
        .value_counts(['Short Name', 'Area'])
        .groupby(level=0)
        .apply(group_stat)
        .reset_index(),
    how = 'outer'
).fillna(1e-10) # needed for sorting, Plotly bug?
# do manual histogram without 2020 areas
k_cnt_new=k_cnt[~k_cnt.Area.isin(['VAST', 'SciVis', 'InfoVis'])]
# with 2020 absolute
px.bar(k_cnt,
    x = 'Short Name',
    y = '# Submissions',
    color = 'Area',
    custom_data = ['Area']
).update_traces(
    hovertemplate = 'Keyword "%{x}" specified by %{y} submissions from area "%{customdata}"<extra></extra>'
).update_layout(
    barmode = 'stack',
    xaxis_dtick = 1,
    xaxis_tickfont_size = 8,
    xaxis_fixedrange = True,
    yaxis_fixedrange = True,
    xaxis_categoryorder = 'total descending',
    title = 'Frequency of keywords across submissions, by area (for all years)',
    **aspect(0.5),
    
).show(config=config)

# Share of each area within a keyword, in percent of that keyword's total.
# The original rounded a column called 'Percentage', which does not exist, so
# nothing was rounded; round the column that is actually plotted.
_kw_totals = k_cnt.groupby(['Short Name'])['# Submissions'].transform(lambda x: x.sum())
k_cnt['Submissions_pct'] = k_cnt['# Submissions'] / _kw_totals * 100
k_cnt = k_cnt.round({'Submissions_pct': 1})

# make sure order is consistent over absolute & percentage plot
k_cnt['Totals'] = _kw_totals
k_cnt = k_cnt.sort_values('Totals', ascending=False)

# Percentages, including the 2020 areas. The x order follows the row order
# established by the sort above ('trace' category order).
px.bar(k_cnt,
    x = 'Short Name',
    y = 'Submissions_pct',
    color = 'Area',
    custom_data = ['Area']
).update_traces(
    # y is a percentage here, so the hover text must not call it a count.
    hovertemplate = '%{y}% of submissions specifying keyword "%{x}" are from area "%{customdata[0]}"<extra></extra>'
).update_layout(
    barmode = 'stack',
    xaxis_dtick = 1,
    xaxis_tickfont_size = 8,
    xaxis_fixedrange = True,
    yaxis_fixedrange = True,
    xaxis_categoryorder = 'trace',
    title = 'Frequency of keywords across submissions, by area (for all years)',
    yaxis_title = '% of Submissions',
    **aspect(0.5)
).show(config=config)

# Absolute counts for the table that leaves out the 2020 areas.
(
    px.bar(
        k_cnt_new,
        x='Short Name',
        y='# Submissions',
        color='Area',
        custom_data=['Area'],
    )
    .update_layout(
        barmode='stack',
        xaxis_dtick=1,
        xaxis_tickfont_size=8,
        xaxis_fixedrange=True,
        yaxis_fixedrange=True,
        xaxis_categoryorder='total descending',
        title='Frequency of keywords across submissions, by area (excluding 2020)',
        **aspect(0.5),
    )
    .update_traces(
        hovertemplate='Keyword "%{x}" specified by %{y} submissions from area "%{customdata}"<extra></extra>',
    )
    .show(config=config)
)


# Share of each area within a keyword (in percent), for the table without the
# 2020 areas. k_cnt_new is a filtered selection of k_cnt, so new columns are
# added with assign() rather than item assignment (avoids pandas'
# SettingWithCopyWarning). The original rounded a non-existent 'Percentage'
# column; the plotted column is rounded here instead.
_kw_totals_new = (
    k_cnt_new.groupby(['Short Name'])['# Submissions'].transform(lambda x: x.sum())
)
k_cnt_new = (
    k_cnt_new
    .assign(
        Submissions_pct=(k_cnt_new['# Submissions'] / _kw_totals_new * 100).round(1),
        # make sure order is consistent over absolute & percentage plot
        Totals=_kw_totals_new,
    )
    .sort_values('Totals', ascending=False)
)

# without 2020 in percent
px.bar(k_cnt_new,
    x = 'Short Name',
    y = 'Submissions_pct',
    color = 'Area',
    custom_data = ['Area']
).update_traces(
    # y is a percentage here, so the hover text must not call it a count.
    hovertemplate = '%{y}% of submissions specifying keyword "%{x}" are from area "%{customdata[0]}"<extra></extra>'
).update_layout(
    barmode = 'stack',
    xaxis_dtick = 1,
    xaxis_tickfont_size = 8,
    xaxis_fixedrange = True,
    yaxis_fixedrange = True,
    xaxis_categoryorder = 'trace',
    title = 'Frequency of keywords across submissions, by area (excluding 2020)',
    yaxis_title = '% of Submissions',
    **aspect(0.5)
).show(config=config)

How are keywords distributed across areas?

Code

# Manual histogram: crossing every keyword with every area keeps keywords
# that no submission specified in the table.
_all_areas = pd.DataFrame(staticdata['area'].values(), columns=['Area'])
_observed = (
    k_all
    .value_counts(['Short Name', 'Area'])
    .rename('# Submissions')
    .reset_index()
)
k_cnt = (
    staticdata['keywords']
    .merge(_all_areas, how='cross')
    .merge(_observed, how='outer')
    .fillna(1e-10)  # needed for sorting, Plotly bug?
)

(
    px.bar(
        k_cnt,
        x='Short Name',
        y='# Submissions',
        color='Area',
        custom_data=['Area'],
    )
    .update_layout(
        barmode='stack',
        xaxis_dtick=1,
        xaxis_tickfont_size=8,
        xaxis_fixedrange=True,
        yaxis_fixedrange=True,
        xaxis_categoryorder='total descending',
        title='Frequency of keywords across submissions, by area',
        **aspect(0.5),
    )
    .update_traces(
        hovertemplate='Keyword "%{x}" specified by %{y} submissions from area "%{customdata}"<extra></extra>',
    )
    .show(config=config)
)

How many submissions specified a given number of keywords?

Code

# Number of submissions for every (keyword count, area) combination.
_kw_area_counts = submissions.value_counts(['# Keywords', 'Area'])
tmp = _kw_area_counts.rename('# Submissions').reset_index()

(
    px.bar(
        tmp,
        x='# Keywords',
        y='# Submissions',
        color='Area',
        barmode='stack',
        custom_data=['Area'],
    )
    .update_layout(
        xaxis_dtick=1,
        title='Keyword count per submission',
        **aspect(0.5),
    )
    .update_traces(
        hovertemplate='%{y} submissions specified %{x} keywords in area "%{customdata}"<extra></extra>',
    )
    .show(config=config)
)

Does keyword count correlate with decision?

Code

# Bucket the keyword count: counts below 10 keep their own label, everything
# else is pooled into '≥10'. Labels are strings so the axis is categorical.
_kw_bucket = submissions['# Keywords'].map(lambda n: str(n) if n < 10 else '≥10')
tmp = (
    submissions
    .assign(**{'# Keywords': _kw_bucket})
    .value_counts(['# Keywords', 'Decision'])
    .groupby(level=0)
    .apply(group_stat)
    .reset_index()
)

# Absolute number of submissions per bucket, stacked by decision.
(
    px.bar(
        tmp,
        x='# Keywords',
        y='# Submissions',
        color='Decision',
        barmode='stack',
        custom_data=['Decision', '% Submissions', 'Total'],
    )
    .update_layout(
        xaxis_dtick=1,
        xaxis_type='category',
        xaxis_categoryorder='category ascending',
        title='Decisions by keyword count',
        **aspect(0.5),
    )
    .update_traces(
        hovertemplate='%{y} (%{customdata[1]}%) of %{customdata[2]} submissions with %{x} keywords had decision "%{customdata[0]}"<extra></extra>',
    )
    .show(config=config)
)

# Same data, with the percentage column on the y axis.
(
    px.bar(
        tmp,
        x='# Keywords',
        y='% Submissions',
        color='Decision',
        barmode='stack',
        custom_data=['Decision', '# Submissions', 'Total'],
    )
    .update_layout(
        xaxis_dtick=1,
        xaxis_type='category',
        xaxis_categoryorder='category ascending',
        title='Decisions by keyword count',
        **aspect(0.5),
    )
    .update_traces(
        hovertemplate='%{y}% (%{customdata[1]} in total) of %{customdata[2]} submissions with %{x} keywords had decision "%{customdata[0]}"<extra></extra>',
    )
    .show(config=config)
)

Do specific keywords correlate with decision?

Code

# Keyword x decision table. crosstab (rather than value_counts) is used so
# that combinations with a count of 0 are present, which plotly's sorting
# seems to need to work correctly.
k_dec = (
    pd.crosstab(k_all["Short Name"], k_all["FinalDecision"])
    .stack()
    .groupby(level = 0)
    .apply(group_stat)
    .reset_index()
)

# Row order drives the x order of both plots ('trace' category order).
k_dec = k_dec.sort_values('Total', ascending=False)

# Absolute counts.
px.bar(k_dec,
    x = 'Short Name',
    y = '# Submissions',
    color = 'FinalDecision',
    custom_data = ['FinalDecision', '% Submissions', 'Total'],
).update_layout(
    xaxis_dtick = 1,
    xaxis_tickfont_size = 8,
    title = 'Decision by presence of keyword',
    **aspect(0.4),
    xaxis_categoryorder = 'trace'
).update_traces(
    # Fixed: the closing quote after the decision was missing.
    hovertemplate = "%{y} of %{customdata[2]} submissions (%{customdata[1]}%) specifying keyword '%{x}' had decision '%{customdata[0]}'<extra></extra>"
).show(config=config)

# Percentages.
px.bar(k_dec,
    x = 'Short Name',
    y = '% Submissions',
    color = 'FinalDecision',
    custom_data = ['FinalDecision', '# Submissions', 'Total'],
).update_layout(
    xaxis_categoryorder = 'trace',
    xaxis_dtick = 1,
    xaxis_tickfont_size = 8,
    xaxis_fixedrange = True,
    yaxis_fixedrange = True,
    title = 'Decision by presence of keyword',
    **aspect(0.4),
).update_traces(
    # Fixed: the closing quote after the decision was missing.
    hovertemplate = "%{y}% of %{customdata[2]} submissions (%{customdata[1]} in total) specifying keyword '%{x}' had decision '%{customdata[0]}'<extra></extra>"
).show(config=config)

How often are keywords “esoteric”, i.e. used alone?

Code

# One row per distinct (keyword, category, keyword count) combination; the
# number of co-keywords is the keyword count minus the keyword itself.
# NOTE(review): value_counts collapses submissions into distinct combinations,
# so the box plot below is not weighted by '# Submissions' - confirm intended.
tmp = (
    k_all
    .set_index('Paper ID')
    .value_counts(['Short Name', 'Category', '# Keywords'])
    .rename('# Submissions')
    .reset_index()
)
tmp['# Co-Keywords'] = tmp['# Keywords'] - 1

(
    px.box(
        tmp,
        x='Short Name',
        y='# Co-Keywords',
        color='Category',
    )
    .update_traces(
        width=.5,
        line_width=1,
    )
    .update_layout(
        xaxis_dtick=1,
        xaxis_tickfont_size=8,
        **aspect(0.4),
    )
    .show(config=config)
)

Code

# Strip plot of the same table, keywords ordered by mean descending.
(
    px.strip(
        tmp,
        x='Short Name',
        y='# Co-Keywords',
        color='Category',
    )
    .update_traces(
        width=.5,
        line_width=1,
    )
    .update_layout(
        xaxis_dtick=1,
        xaxis_tickfont_size=8,
        xaxis_categoryorder="mean descending",
        **aspect(0.4),
    )
    .show(config=config)
)

How often are pairs of keywords specified together?

Code

# All unordered keyword pairs per paper (columns 0 and 1), joined with the
# paper's 'Decision'.
k_pairs = (
    k_all
    .groupby('Paper ID')
    .apply(lambda grp: pd.DataFrame(itertools.combinations(grp['Short Name'].values, 2)))
    .join(submissions['Decision'])
)

# Keep the 40 most frequent pairs and count them per decision.
_top_pairs = k_pairs.groupby([0, 1]).size().nlargest(40).index
_top_rows = k_pairs.set_index([0, 1]).loc[_top_pairs]
tmp = (
    _top_rows
    .assign(**{'Keyword Pair': [' + '.join(pair) for pair in _top_rows.index.values]})
    .value_counts(['Keyword Pair', 'Decision'], sort=False)
    .rename('# Submissions')
    .reset_index()
)

(
    px.bar(
        tmp,
        x='Keyword Pair',
        y='# Submissions',
        color='Decision',
        custom_data=['Decision'],
    )
    .update_traces(
        hovertemplate='%{y} submissions with keyword pair "%{x}" had decision "%{customdata[0]}"<extra></extra>',
    )
    .update_layout(
        xaxis_dtick=1,
        xaxis_categoryorder='total descending',
        xaxis_tickfont_size=8,
        title='Top 40 keyword pairs',
        **aspect(0.55),
    )
    .show(config=config)
)

# Same pair table as before, this time joined with 'FinalDecision'.
k_pairs = (
    k_all
    .groupby('Paper ID')
    .apply(lambda grp: pd.DataFrame(itertools.combinations(grp['Short Name'].values, 2)))
    .join(submissions['FinalDecision'])
)

# Keep the 40 most frequent pairs and count them per final decision.
_top_pairs = k_pairs.groupby([0, 1]).size().nlargest(40).index
_top_rows = k_pairs.set_index([0, 1]).loc[_top_pairs]
tmp = (
    _top_rows
    .assign(**{'Keyword Pair': [' + '.join(pair) for pair in _top_rows.index.values]})
    .value_counts(['Keyword Pair', 'FinalDecision'], sort=False)
    .rename('# Submissions')
    .reset_index()
)

(
    px.bar(
        tmp,
        x='Keyword Pair',
        y='# Submissions',
        color='FinalDecision',
        custom_data=['FinalDecision'],
    )
    .update_traces(
        hovertemplate='%{y} submissions with keyword pair "%{x}" had decision "%{customdata[0]}"<extra></extra>',
    )
    .update_layout(
        xaxis_dtick=1,
        xaxis_categoryorder='total descending',
        xaxis_tickfont_size=8,
        title='Top 40 keyword pairs',
        **aspect(0.55),
    )
    .show(config=config)
)

# Per-pair totals. Only the numeric column is summed: summing the whole frame
# also "sums" the FinalDecision strings on pandas >= 2, which adds a second
# FinalDecision column to the merge and breaks the plot below.
_pair_totals = (
    tmp
    .groupby('Keyword Pair')['# Submissions']
    .sum()
    .rename('Total')
    .reset_index()
)

# Percent = share of a pair's submissions per final decision, one decimal.
newtmp = (
    tmp
    .merge(_pair_totals, on='Keyword Pair')
    .assign(Percent=lambda x: round(1000 * x['# Submissions'] / x.Total) / 10)
    .sort_values(by="Percent")
)

# Order the pairs by acceptance rate, as the title promises. 'total
# descending' cannot do that because every stacked bar sums to ~100.
# 'Accept' is the label shown in the FinalDecision column of this report's
# submission table; pairs without an accepted submission get a rate of 0.
_accept_rate = (
    newtmp
    .loc[newtmp['FinalDecision'] == 'Accept']
    .set_index('Keyword Pair')['Percent']
    .reindex(newtmp['Keyword Pair'].unique(), fill_value=0)
    .sort_values(ascending=False)
)

px.bar(newtmp,
    x = 'Keyword Pair',
    y = 'Percent',
    color = 'FinalDecision',
    custom_data = ['FinalDecision'],
).update_layout(
    xaxis_dtick = 1,
    xaxis_categoryorder = 'array',
    xaxis_categoryarray = list(_accept_rate.index),
    xaxis_tickfont_size = 8,
    title = 'Top 40 keyword pairs sorted by acceptance rate',
    **aspect(0.55)
).update_traces(
    # y is a percentage here, so the hover text must not call it a count.
    hovertemplate = '%{y}% of submissions with keyword pair "%{x}" had decision "%{customdata[0]}"<extra></extra>',
).show(config=config)

# Pair counts as a keyword x keyword matrix. reindex() makes sure every
# keyword has a row and a column, also those that never occur in a pair.
cooc = (k_pairs
    .groupby([0,1])
    .size()
    .unstack()
    .reindex(
        index = staticdata['keywords']['Short Name'],
        columns = staticdata['keywords']['Short Name']
    )
    .fillna(0)
)

# Each pair is stored once as (0, 1); adding the transpose makes the matrix
# symmetric.
cooc = (cooc + cooc.T)

# Sort rows by their largest co-occurrence count, then put the columns in the
# same order (this also drops the helper 'Total' column).
cooc['Total'] = cooc.max(axis=0)
cooc = cooc.sort_values(by='Total', ascending=False)
cooc = cooc.loc[:,list(cooc.index)]

# Blank out the diagonal. mask() is used instead of np.fill_diagonal on
# cooc.values, which only works while .values is a writable view of the frame
# (not guaranteed, and read-only under pandas copy-on-write).
cooc = cooc.mask(np.eye(len(cooc), dtype=bool))

px.imshow(cooc,
    color_continuous_scale='portland',
).update_traces(
    connectgaps = False,
    hoverongaps = False,
    hovertemplate = "Keywords '%{x}' and '%{y}' are jointly specified in %{z} submissions<extra></extra>",
    colorbar_title = '# Submissions',
).update_layout(
    xaxis_dtick = 1,
    xaxis_tickfont_size = 7,
    yaxis_dtick = 1,
    yaxis_tickfont_size = 7,
    hovermode = 'closest',
    xaxis_showgrid = False,
    yaxis_showgrid = False,
    title = 'Co-occurrence of keywords',
    **aspect(.7)
).show(config=config)

Code

# Rebuild the (non-symmetrised) pair-count matrix and sort its rows by their
# largest count.
# NOTE(review): cooc is not referenced again in this section of the report -
# confirm whether this recomputation is still needed.
_kw_names = staticdata['keywords']['Short Name']
cooc = (
    k_pairs
    .groupby([0, 1])
    .size()
    .unstack()
    .reindex(index=_kw_names, columns=_kw_names)
    .fillna(0)
)
cooc['Total'] = cooc.max(axis=1)
cooc = cooc.sort_values(by='Total', ascending=False)

# All unordered keyword triples per paper (columns 0, 1, 2), joined with the
# paper's final decision.
k_triples = (k_all
    .groupby('Paper ID')
    .apply(lambda g: pd.DataFrame(itertools.combinations(g['Short Name'].values, 3)))
    .join(submissions['FinalDecision'])
)

# Restrict to the 40 most frequent triples and give each a readable label.
tmp = k_triples.groupby([0,1,2]).size().nlargest(40)
tmp = (
    k_triples
    .set_index([0,1,2])
    .loc[tmp.index]
    .assign(**{'Keyword Triple': lambda df: [' + '.join(v) for v in df.index.values]})
)
# crosstab keeps zero counts; group_stat adds the per-triple statistics used
# below ('# Submissions', '% Submissions', 'Total').
tmp= (
    pd.crosstab(tmp['Keyword Triple'], tmp['FinalDecision']).stack()
    .groupby(level = 0)
    .apply(group_stat)
    .reset_index()
    .sort_values('Total', ascending=False)
)

# Absolute counts.
px.bar(tmp,
    x = 'Keyword Triple',
    y = '# Submissions',
    color = 'FinalDecision',
).update_layout(
    xaxis_dtick = 1,
    xaxis_tickfont_size = 8,
    title = 'Top 40 keyword triples',
    **aspect(0.5)
).update_traces(
    # Fixed: the hover text said "keyword pair" on the triples plot.
    hovertemplate = '%{y} submissions with keyword triple "%{x}" had decision "%{fullData.name}"<extra></extra>',
).show(config=config)


# Percentages.
px.bar(tmp,
    x = 'Keyword Triple',
    y = '% Submissions',
    color = 'FinalDecision',
).update_layout(
    xaxis_dtick = 1,
    xaxis_tickfont_size = 8,
    title = 'Top 40 keyword triples',
    **aspect(0.5)
).update_traces(
    # Fixed: the hover text said "keyword pair" on the triples plot.
    hovertemplate = '%{y}% submissions with keyword triple "%{x}" had decision "%{fullData.name}"<extra></extra>',
).show(config=config)

Matching for reviewers

What is the distribution of match scores by keyword?

Code

# Long table of match scores (entries that are not greater than -1.0 are
# dropped), with one row per keyword of the scored paper.
_scores_long = (
    matchscores.T
    .stack()
    .rename('Score')
    .loc[lambda s: s > -1.0]
    .reset_index()
)
_paper_keywords = k_all.loc[:, ['Paper ID', 'Short Name', 'Category']]
tmp = _scores_long.merge(_paper_keywords, on='Paper ID').reset_index()

(
    px.box(
        tmp,
        x='Short Name',
        y='Score',
        color='Category',
    )
    .update_traces(
        width=.5,
        line_width=1,
    )
    .update_layout(
        xaxis_dtick=1,
        xaxis_tickfont_size=8,
        title='Distribution of match scores per keyword',
        **aspect(0.4),
    )
    .show(config=config)
)

What is the number of “high” match scores, per submission?

Code

# Transposed match scores with entries that are not greater than -1.0 blanked.
tmp = matchscores.T.where(lambda scores: scores > -1.0, None)

# Score thresholds; also used by the per-keyword analysis further down.
threshold = [0.5, 0.7, 0.9]

# For every row, count the scores reaching each threshold.
tmp = pd.concat(
    [(tmp >= q).sum(axis=1).rename('≥ %.1f' % q) for q in threshold],
    axis=1,
)

(
    px.bar(
        tmp,
        barmode='overlay',
        opacity=1,
    )
    .update_traces(
        hovertemplate='Submission %{x} has %{y} matches %{fullData.name}<extra></extra>',
    )
    .update_layout(
        legend_title='Match Score',
        bargap=0.1,
        xaxis_type='category',
        xaxis_tickfont_size=8,
        yaxis_title='# Reviewers',
        **aspect(0.4),
    )
    .show(config=config)
)

Which are the 10 submissions with the lowest number of “good” (≥ 0.5) match scores?

Code

# The ten submissions with the fewest scores >= 0.5.
ind = tmp['≥ 0.5'].nsmallest(10).index

# Positive ("willing"/"want") bids received by those submissions.
_pos_bids = (
    bids
    .query('`Paper ID` in @ind and Bid in ["willing", "want"]')
    .value_counts(['Paper ID'])
    .rename("Pos. Bids")
)

# Keyword -> short name lookup, built once instead of once per keyword.
_short_name = staticdata['keywords'].set_index('Keyword')['Short Name']

# Left as a bare expression so the notebook renders the resulting table.
(
    tmp
    .merge(_pos_bids, on='Paper ID')
    .merge(submissions, on='Paper ID')
    .sort_values('≥ 0.5')
    .assign(Keywords=lambda df: df.Keywords.apply(
        lambda kws: ', '.join([_short_name.loc[k] for k in kws])
    ))
)

	≥ 0.5	≥ 0.7	≥ 0.9	Pos. Bids	sid	Old Paper ID	Decision	year	Area	Keywords	# Keywords	FinalDecision
Paper ID
873	0	0	0	14	873	287	Reject	2021	Theoretical & Empirical	OtherContrib, OtherTopic	2	Reject
1139	0	0	0	16	1139	111	Accept	2022	Theoretical & Empirical	NAData, OtherContrib	2	Accept
844	50	31	11	10	844	258	Confer vs. cond Accept	2021	Data Transformations	Vector_Tensor, CompTop, Flow	3	Accept
905	52	28	28	27	905	319	Reject	2021	Data Transformations	Flow	1	Reject
1441	54	32	13	14	1441	413	Reject	2022	Data Transformations	Vector_Tensor, CompTop, Flow	3	Reject
1971	54	29	29	12	1971	483	Confer vs. cond Accept	2023	Systems & Rendering	Flow	1	Accept
1494	56	27	27	16	1494	6	Reject	2023	Theoretical & Empirical	Vector_Tensor	1	Reject
1587	56	27	27	18	1587	99	Reject	2023	Theoretical & Empirical	Vector_Tensor	1	Reject
1143	62	34	25	18	1143	115	Reject	2022	Theoretical & Empirical	Vector_Tensor, Flow	2	Reject
1325	62	11	11	4	1325	297	Reject	2022	Applications	CompSystems	1	Reject

What is the number of “high” match scores, per keyword?

Code

# Transposed match scores with entries that are not greater than -1.0 blanked.
_valid_scores = matchscores.T.where(lambda scores: scores > -1.0, None)

# Attach the scores to every keyword of the paper and index by keyword.
tmp = (
    k_all
    .set_index('Paper ID')['Short Name']
    .to_frame()
    .merge(_valid_scores, left_index=True, right_index=True, how='inner')
    .set_index('Short Name')
)

# Count the scores reaching each threshold per row, then average per keyword.
_per_threshold = [(tmp >= q).sum(axis=1).rename('≥ %.1f' % q) for q in threshold]
tmp = pd.concat(_per_threshold, axis=1).groupby('Short Name').mean()

(
    px.bar(
        tmp,
        barmode='overlay',
        opacity=1,
    )
    .update_traces(
        hovertemplate='Keyword %{x} has %{y:.1f} matches %{fullData.name}<extra></extra>',
    )
    .update_layout(
        legend_title='Match Score',
        bargap=0.1,
        xaxis_dtick=1,
        xaxis_type='category',
        xaxis_tickfont_size=8,
        yaxis_title='# Matches',
        title='Match scores by keyword',
        **aspect(0.4),
    )
    .show(config=config)
)



# The same chart three times, each sorted (descending) by a different
# threshold column; 'trace' category order keeps the sorted row order.
for _sort_col in ['≥ 0.9', '≥ 0.7', '≥ 0.5']:
    (
        px.bar(
            tmp.sort_values(_sort_col, ascending=False),
            barmode='overlay',
            opacity=1,
        )
        .update_traces(
            hovertemplate='Keyword %{x} has %{y:.1f} matches %{fullData.name}<extra></extra>',
        )
        .update_layout(
            legend_title='Match Score',
            bargap=0.1,
            xaxis_dtick=1,
            xaxis_type='category',
            xaxis_tickfont_size=8,
            yaxis_title='# Matches',
            title='Match scores by keyword',
            xaxis_categoryorder="trace",
            **aspect(0.4),
        )
        .show(config=config)
    )

Bidding

How many bids did individual PC members make?

Code

# Number of bids per reviewer and bid type.
tmp = (bids
    .value_counts(['Reviewer', 'Bid'], sort=False)
    .rename('# Bids')
    .reset_index()
)

# Stacked bars, reviewers ordered by their total number of bids; tick labels
# are hidden.
px.bar(tmp,
    x = 'Reviewer',
    y = '# Bids',
    color = 'Bid'
).update_layout(
    xaxis_type = 'category',
    xaxis_categoryorder = 'total descending',
    xaxis_showticklabels = False,
    **aspect(0.4)
).update_traces(
    # Fixed typo in the hover text ("Reviewier").
    hovertemplate = 'Reviewer %{x} made %{y} "%{fullData.name}" bids.<extra></extra>'
).show(config=config)

How many (positive) bids did each submission receive?

Code

# Bids per paper and bid type, keeping only the positive types.
_bid_counts = (
    bids
    .value_counts(['Paper ID', 'Bid'], sort=False)
    .rename('# Bids')
    .reset_index()
)
tmp = _bid_counts.loc[_bid_counts.Bid.isin(['want', 'willing'])]

(
    px.bar(
        tmp,
        x='Paper ID',
        y='# Bids',
        color='Bid',
    )
    .update_traces(
        hovertemplate='Paper %{x} received %{y} "%{fullData.name}" bids.<extra></extra>',
    )
    .update_layout(
        xaxis_type='category',
        xaxis_categoryorder='total descending',
        xaxis_showticklabels=False,
        title='Positive Bids per Paper',
        **aspect(0.4),
    )
    .show(config=config)
)

Code

# A paper counts as popular when "willing" + "want" bids reach this number.
popular = 15

# Papers x bid type table of positive bid counts (0 where a type is absent).
_pos_counts = (
    bids
    .query('Bid in ["want", "willing"]')
    .value_counts(['Paper ID', 'Bid'], sort=False)
    .unstack()
    .fillna(0)
)

# Group papers sharing the same (want, willing) combination; keep their ids
# and how many there are.
tmp = (
    _pos_counts
    .groupby(['want', 'willing'])
    .apply(lambda grp: pd.Series({'ids': grp.index.values, 'count': grp.index.size}))
    .reset_index()
)
tmp = tmp.assign(
    popular=np.where(
        tmp['willing'] + tmp['want'] >= popular,
        "≥ %d" % popular,
        "< %d" % popular,
    )
)

(
    px.scatter(
        tmp,
        x='willing',
        y='want',
        size='count',
        color='popular',
        custom_data=['count', 'ids'],
    )
    .update_traces(
        hovertemplate='%{customdata[0]} papers received %{x} "willing" and %{y} "want" bids',
    )
    .update_layout(
        legend_title='Total Pos. Bids',
        title='Distribution of Positive Bids',
        **aspect(0.4),
    )
    .show(config=config)
)

Does the presence of specific keywords correlate with bidding?

We run a reviewer-independent ridge regression model where the independent variables are the (weighted) presence of each keyword, and the dependent variable is the overall reviewer interest. We measure interest by giving each “willing” bid a score of 1 and each “want” bid a score of 2:

Code

# One row per keyword with a stable column index ('ix') for the design matrix.
tmp_3 = staticdata['keywords'][['Short Name']].copy()
tmp_3['ix'] = np.arange(len(tmp_3))

# Keywords per paper, and the positive bids ("willing" / "want").
tmp_1 = k_all[['Paper ID', 'Short Name']]
tmp_2 = bids[bids['Bid'].isin(['willing', 'want'])]

# Every (paper keyword, positive bid) combination; "want" weighs 2 and
# "willing" weighs 1.
df = tmp_1.merge(tmp_3, on="Short Name").merge(tmp_2, on="Paper ID")
df['weight'] = np.where(df['Bid'] == 'willing', 1, 2)

# Summed bid weight per (paper, keyword) and 1 / (#keywords of the paper) as
# the keyword's weight within its paper.
total_weight = df[['Paper ID', 'ix', 'weight']].groupby(['Paper ID', 'ix']).sum().reset_index()
keyword_count = tmp_1.groupby(['Paper ID']).count().reset_index()
keyword_count['Keyword Weight'] = 1.0/keyword_count['Short Name']
total_weight = total_weight.merge(keyword_count[['Paper ID', 'Keyword Weight']], on="Paper ID")

# Rows are addressed directly by Paper ID.
# NOTE(review): IDs without any positive bid stay all-zero rows with a target
# of 0 and still take part in the fit - confirm this is intended.
nrows = int(total_weight['Paper ID'].max()) + 1
# One column per keyword. Using the largest observed 'ix' instead would make
# coef_ shorter than tmp_3 whenever the last keywords are never used, and the
# assignment of 'Importance' below would fail.
ncols = len(tmp_3)
design_matrix = np.zeros((nrows, ncols))
rhs = np.zeros(nrows)

# Vectorised fill (replaces the row-by-row iterrows loop).
_rows = total_weight['Paper ID'].to_numpy(dtype=int)
_cols = total_weight['ix'].to_numpy(dtype=int)
design_matrix[_rows, _cols] = total_weight['Keyword Weight'].to_numpy()
rhs[_rows] = total_weight['weight'].to_numpy()

import scipy.linalg
from sklearn.linear_model import Ridge
# Ideally, we find the best regularizer by splitting into training/validation,
# but on inspection the order doesn't seem to change too much
lr = Ridge(alpha=1).fit(design_matrix, rhs)
tmp_3['Importance'] = lr.coef_
tmp_3 = tmp_3.sort_values(by=['Importance']).merge(staticdata['keywords'], on='Short Name', )

# Keywords in ascending importance ('trace' order keeps the sorted row order).
px.scatter(tmp_3,
    x="Short Name",
    y="Importance",
    color='Category',
    custom_data = ['Keyword'],
).update_layout(
    title = 'Keyword Importance for Bidding',
    xaxis_dtick = 1,
    xaxis_categoryorder = 'trace',
    xaxis_tickfont_size = 8,
    **aspect(0.4)
).update_traces(
    hovertemplate = 'Importance of "%{customdata[0]}": %{y}<extra></extra>'
).show(config=config)

Assignment

How many papers were PC members assigned?

Code

# Number of assignments per reviewer.
tmp = (
    assignments
    .value_counts(['Reviewer'])
    .rename('# Assignments')
    .reset_index()
)

(
    px.histogram(
        tmp,
        x='# Assignments',
    )
    .update_layout(
        bargap=.1,
        yaxis_title='# PC members',
        title='Distribution of assignments',
        **aspect(0.4),
    )
    .update_traces(
        hovertemplate='%{y} reviewers were assigned %{x} submissions',
    )
    .show(config=config)
)

# Assignments per reviewer and year; the year comes from the submission.
_with_year = assignments.merge(submissions['year'], on='Paper ID')
tmp2 = (
    _with_year
    .value_counts(['Reviewer', 'year'])
    .rename('# Assignments')
    .reset_index()
)

# One histogram row per year.
(
    px.histogram(
        tmp2,
        x='# Assignments',
        facet_row="year",
        category_orders={'year': [2023, 2022, 2021]},
    )
    .update_layout(
        bargap=.1,
        yaxis1_title='# PC members',
        yaxis2_title='# PC members',
        yaxis3_title='# PC members',
        title='Distribution of assignments, per year',
        **aspect(0.5),
    )
    .update_traces(
        hovertemplate='%{y} reviewers were assigned %{x} submissions',
    )
    .show(config=config)
)

Code

# Assignments per reviewer and role. The count column is named explicitly:
# pandas < 2.0 leaves it as column 0 after reset_index(), pandas >= 2.0 calls
# it 'count', so referring to it as x = 0 only works on old versions.
tmp = (
    assignments
    .value_counts(['Reviewer', 'Role'])
    .rename('# Assignments')
    .reset_index()
)

px.histogram(tmp,
    x = '# Assignments',
    color = 'Role',
).update_traces(
    hovertemplate = '%{y} reviewers were assigned %{x} submissions as %{fullData.name}<extra></extra>'
).update_layout(
    bargap = .1,
    barmode = 'group',
    xaxis_title = '# Assignments',
    yaxis_title = '# Members',
    title = 'Distribution of assignments',
    **aspect(0.4)
).show(config=config)

# Assignments per reviewer, role and year. The count column is named
# explicitly: pandas < 2.0 leaves it as column 0 after reset_index(),
# pandas >= 2.0 calls it 'count', so x = 0 only works on old versions.
tmp2 = (
    assignments
    .merge(submissions['year'], on='Paper ID')
    .value_counts(['Reviewer', 'Role', 'year'])
    .rename('# Assignments')
    .reset_index()
)

# One histogram row per year.
px.histogram(tmp2,
    x = '# Assignments',
    color = 'Role',
    facet_row='year',
    category_orders={'year': [2023, 2022, 2021]}
).update_traces(
    hovertemplate = '%{y} reviewers were assigned %{x} submissions as %{fullData.name}<extra></extra>'
).update_layout(
    bargap = .1,
    barmode = 'group',
    xaxis_title = '# Assignments',
    yaxis1_title = '# Members',
    yaxis2_title = '# Members',
    yaxis3_title = '# Members',
    title = 'Distribution of assignments',
    **aspect(0.5)
).show(config=config)

How many areas did reviewers review in?

Code

# Number of distinct areas among each reviewer's assigned submissions. The
# result of apply() is unnamed, so reset_index() exposes it as column 0.
_assigned = assignments.merge(submissions, on='Paper ID')
tmp = (
    _assigned
    .groupby('Reviewer')
    .apply(lambda grp: len(grp['Area'].unique()))
    .reset_index()
)

(
    px.histogram(
        tmp,
        x=0,
    )
    .update_layout(
        bargap=.1,
        xaxis_title='# Areas',
        yaxis_title='# PC members',
        **aspect(0.4),
    )
    .update_traces(
        hovertemplate='%{y} PC members were assigned submissions from %{x} area(s)',
    )
    .show(config=config)
)

# Distinct areas per reviewer and year; the unnamed apply() result shows up
# as column 0 after reset_index().
_assigned = assignments.merge(submissions, on='Paper ID')
tmp = (
    _assigned
    .groupby(['Reviewer', 'year'])
    .apply(lambda grp: len(grp['Area'].unique()))
    .reset_index()
)

# One histogram row per year.
(
    px.histogram(
        tmp,
        x=0,
        facet_row='year',
        category_orders={'year': [2023, 2022, 2021]},
    )
    .update_layout(
        bargap=.1,
        xaxis_title='# Areas',
        yaxis1_title='# PC members',
        yaxis2_title='# PC members',
        yaxis3_title='# PC members',
        **aspect(0.5),
    )
    .update_traces(
        hovertemplate='%{y} PC members were assigned submissions from %{x} area(s)',
    )
    .show(config=config)
)

How do match scores correlate with bids?

Code

# Look up, for every bid, the reviewer's match score for the paper and the
# paper's area; bids whose score is not greater than -1.0 are dropped.
_bid_score = bids.apply(
    lambda row: matchscores.loc[row['Reviewer'], row['Paper ID']], axis=1)
_bid_area = bids.apply(
    lambda row: submissions.loc[row['Paper ID'], 'Area'], axis=1)
tmp = bids.assign(Score=_bid_score, Area=_bid_area).query('Score > -1.0')

(
    px.box(
        tmp,
        x='Bid',
        y='Score',
        color='Bid',
    )
    .update_traces(
        line_width=2,
        boxmean=True,
    )
    .update_layout(
        showlegend=False,
        xaxis_categoryorder='array',
        xaxis_categoryarray=['want', 'willing', 'reluctant', 'conflict'],
        **aspect(0.4),
    )
    .show(config=config)
)

Code

# As above, additionally carrying the submission year so that the plot can be
# faceted; bids whose score is not greater than -1.0 are dropped.
_bid_score = bids.apply(
    lambda row: matchscores.loc[row['Reviewer'], row['Paper ID']], axis=1)
_bid_area = bids.apply(
    lambda row: submissions.loc[row['Paper ID'], 'Area'], axis=1)
_bid_year = bids.apply(
    lambda row: submissions.loc[row['Paper ID'], 'year'], axis=1)
tmp = (
    bids
    .assign(Score=_bid_score, Area=_bid_area, year=_bid_year)
    .query('Score > -1.0')
)

# One box-plot row per year.
(
    px.box(
        tmp,
        x='Bid',
        y='Score',
        color='Bid',
        facet_row='year',
    )
    .update_traces(
        line_width=2,
        boxmean=True,
    )
    .update_layout(
        showlegend=False,
        xaxis_categoryorder='array',
        xaxis_categoryarray=['want', 'willing', 'reluctant', 'conflict'],
        **aspect(0.8),
    )
    .show(config=config)
)

Code

# Match score distribution per bid type, one violin per area.
(
    px.violin(
        tmp,
        x='Bid',
        y='Score',
        color='Area',
        box=True,
    )
    .update_traces(
        box_line_color='black',
        box_line_width=1,
        line_width=0,
        meanline_visible=True,
        marker_size=4,
    )
    .update_layout(
        title='Match scores by bid by area',
        xaxis_categoryorder='array',
        xaxis_categoryarray=['want', 'willing', 'reluctant', 'conflict'],
        violingap=0.2,
        violingroupgap=0.1,
        **aspect(0.4),
    )
    .show(config=config)
)

# Same violins, one row per year.
(
    px.violin(
        tmp,
        x='Bid',
        y='Score',
        color='Area',
        box=True,
        facet_row='year',
    )
    .update_traces(
        box_line_color='black',
        box_line_width=1,
        line_width=0,
        meanline_visible=True,
        marker_size=4,
    )
    .update_layout(
        title='Match scores by bid by area, by year',
        xaxis_categoryorder='array',
        xaxis_categoryarray=['want', 'willing', 'reluctant', 'conflict'],
        violingap=0.2,
        violingroupgap=0.1,
        **aspect(0.8),
    )
    .show(config=config)
)

How often were reviewers assigned submissions that they bid on?

Code

# Assignments per role and the bid the assigned reviewer made on the paper.
# NOTE(review): the left merge keeps assignments without a bid, but
# value_counts drops rows whose 'Bid' is missing by default - confirm whether
# those assignments should be shown.
tmp = (
    assignments
        .merge(bids, on=['Reviewer', 'Paper ID'], how='left')
        .value_counts(['Role', 'Bid'])
        .rename('Reviewers')
        .reset_index()
)

# Keep the figure in `fig`. Previously `fig` was bound to the return value of
# .show(), which is None.
fig = px.bar(tmp,
    y = 'Reviewers',
    x = 'Role',
    color = 'Bid',
    custom_data = ['Bid']
).update_traces(
    hovertemplate = '%{y} PC members assigned as %{x} bid %{customdata}<extra></extra>',
).update_layout(
    title = "Assignment by bidding",
    **aspect(0.4),
)
fig.show(config=config)

# Assignments per role, bid and submission year.
# NOTE(review): the left merge keeps assignments without a bid, but
# value_counts drops rows whose 'Bid' is missing by default - confirm whether
# those assignments should be shown.
tmp2 = (
    assignments
        .merge(bids, on=['Reviewer', 'Paper ID'], how='left')
        .merge(submissions, on='Paper ID')
        .value_counts(['Role', 'Bid', 'year'])
        .rename('Reviewers')
        .reset_index()
)

# Keep the figure in `fig`. Previously `fig` was bound to the return value of
# .show(), which is None.
fig = px.bar(tmp2,
    y = 'Reviewers',
    x = 'Role',
    color = 'Bid',
    custom_data = ['Bid'],
    facet_row='year'
).update_traces(
    hovertemplate = '%{y} PC members assigned as %{x} bid %{customdata}<extra></extra>',
).update_layout(
    title = "Assignment by bidding, per year",
    **aspect(0.5),
)
fig.show(config=config)