A deep dive into the data on H1B visas released by the U.S.
analysis
Author
Aditya Dahiya
Published
October 14, 2024
The H1B visa program enables U.S. companies to hire foreign workers in specialized fields, such as technology, with visas granted via an annual lottery. A recent Bloomberg News investigation exposed how some companies exploited the lottery system, submitting multiple applications to increase their chances. The dataset used in this analysis, covering fiscal years 2021-2024, was obtained from U.S. Citizenship and Immigration Services (USCIS) through a Freedom of Information Act (FOIA) request. The detailed dataset, including information on employers, job titles, salaries, and visa petitions, can be accessed on GitHub.
Almost 60% of applicants are from India, and, as expected under a lottery, approximately 60% of all H1B visas go to Indians.
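The share of applicants by country can be tallied with a simple `count()` and `mutate()` pipeline. A minimal sketch follows; the column name `country_of_birth` is an assumption (check the Data Dictionary sheet for the actual field name), and a small toy tibble stands in for the real registrations data loaded later.

```r
library(dplyr)

# Toy stand-in for the registrations data; the real analysis would use
# rawdf_single / rawdf_multi. The column name `country_of_birth` is an
# assumption -- verify it against the Bloomberg data dictionary.
toy_df <- tibble::tibble(
  country_of_birth = c("INDIA", "INDIA", "INDIA", "CHINA", "CANADA")
)

country_share <- toy_df |>
  count(country_of_birth, name = "n_applicants") |>
  mutate(share = n_applicants / sum(n_applicants)) |>
  arrange(desc(share))

country_share
```

The same pipeline, pointed at the full FOIA data, yields the country-wise percentages plotted above.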
The U.S. states to which selected H1B visa applicants went show that most went on to work in California or Texas.
The most popular destination cities for selected H1B visa applicants are New York City, followed by Seattle.
Occupation sectors related to computers and IT dominate the fields of study of successful H1B visa applicants.
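The state, city, and occupation rankings above all reduce to the same "count and take the top n" pattern. A minimal sketch, using a toy tibble and assumed column names (`state`, `city`; verify against the data dictionary):

```r
library(dplyr)

# Toy stand-in for the registrations data; `state` and `city` are
# assumed column names, not confirmed from the FOIA file.
toy_df <- tibble::tibble(
  state = c("CA", "CA", "TX", "TX", "NY", "WA"),
  city  = c("San Jose", "San Jose", "Austin", "Dallas",
            "New York", "Seattle")
)

# Top destination states by number of selected applicants
top_states <- toy_df |>
  count(state, sort = TRUE) |>
  slice_max(n, n = 2, with_ties = FALSE)

top_states
```

Swapping `state` for `city` (or an occupation-code column) produces the city and occupation rankings the same way.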
This article and analysis are a work in progress, and new material will be added soon.
The code used to prepare these graphics is given below.
Code
# Data Import and Wrangling Tools
library(tidyverse)    # All things tidy

# Final plot tools
library(scales)       # Nice Scales for ggplot2
library(fontawesome)  # Icons display in ggplot2
library(ggtext)       # Markdown text support for ggplot2
library(showtext)     # Display fonts in ggplot2
library(patchwork)    # Combining plots
library(magick)       # Dealing with images
library(httr)         # Download files
library(zip)          # Handle ZIP files
library(countrycode)  # To get Country Codes
library(sf)           # Mapping

# Getting basic fonts etc. for the entire article
font_add_google("Ubuntu Condensed", "caption_font")
font_add_google("Ubuntu", "body_font")
showtext_auto()

text_col <- "grey10"
bg_col <- "white"
Code
# Get Data Dictionary used by Bloomberg
dictionary <- openxlsx::read.xlsx(
  "https://github.com/BloombergGraphics/2024-h1b-immigration-data/raw/refs/heads/main/TRK_13139_I129_H1B_Registrations_FY21_FY24_FOIA_FIN.xlsx",
  sheet = "Data Dictionary",
  rows = 1:57,
  cols = 1:2
) |>
  janitor::clean_names()

# Single registrations data
# Download the ZIP file from the URL
url <- "https://github.com/BloombergGraphics/2024-h1b-immigration-data/blob/main/TRK_13139_FY2024_single_reg.zip?raw=true"
temp_zip <- tempfile(fileext = ".zip")
GET(url, write_disk(temp_zip, overwrite = TRUE))

# Unzip the file to a temporary directory
temp_dir <- tempdir()
unzip(temp_zip, exdir = temp_dir)

# Read the CSV file into R
csv_file <- file.path(temp_dir, "TRK_13139_FY2024_single_reg.csv")
rawdf_single <- read_csv(csv_file) |>
  janitor::clean_names()

# Multiple registrations data
url1 <- "https://github.com/BloombergGraphics/2024-h1b-immigration-data/blob/main/TRK_13139_FY2024_multi_reg.zip?raw=true"
temp_zip <- tempfile(fileext = ".zip")
GET(url1, write_disk(temp_zip, overwrite = TRUE))

# Unzip the file to a temporary directory
temp_dir <- tempdir()
unzip(temp_zip, exdir = temp_dir)

# Read the CSV file into R
csv_file <- file.path(temp_dir, "TRK_13139_FY2024_multi_reg.csv")
rawdf_multi <- read_csv(csv_file) |>
  janitor::clean_names()

# Clean up temporary files
unlink(temp_zip)  # Delete the temporary zip file
# Remove the temporary directory and its contents
# unlink(temp_dir, recursive = TRUE)
rm(csv_file, temp_dir, temp_zip, url, url1)

print(object.size(rawdf_single), units = "Mb")
print(object.size(rawdf_multi), units = "Mb")