Desafio 3

Author

Elisa Amorim Da Costa

JSON

#O arquivo, heart_disease_dataset.json, contém um conjunto de dados anônimos de pacientes, com diversas variáveis clínicas (idade, sexo, colesterol, etc.) utilizadas para prever a presença ou ausência de doenças cardíacas

#Para carregar o arquivo JSON, utiliza-se o pacote jsonlite. Em seguida, a função fromJSON() lê o arquivo e o converte diretamente em um data frame no R.

library(jsonlite)
library(arrow)
library(dplyr)

#`jsonlite`: pacote para trabalhar com JSON
#`arrow`: biblioteca padrão para manipulação de arquivos Parquet no R

#Arquivo JSON sobre doenças cardíacas

# Path of the JSON data file expected to come out of "archive.zip".
arquivo_json <- "heart_disease_dataset.json"

# Extract the archive only when the JSON file is not already on disk, so
# repeated runs do not re-extract and overwrite existing files.
if (!file.exists(arquivo_json)) {
  unzip("archive.zip")
}

# Fail early with a clear message if the data file is still missing, instead
# of letting the read below fail with a less specific error.
if (!file.exists(arquivo_json)) {
  stop(
    "Arquivo '", arquivo_json,
    "' não encontrado após descompactar 'archive.zip'.",
    call. = FALSE
  )
}

# Read the JSON file with jsonlite::fromJSON().
dados_coracao <- fromJSON(arquivo_json)

# Show the first rows of the loaded data.
head(dados_coracao)

  age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
1  67   1  2      111  536   0       2      88     0     1.3     3  2    3
2  57   1  3      109  107   0       2     119     0     5.4     2  0    3
3  43   1  4      171  508   0       1     113     0     3.7     3  0    7
4  71   0  4       90  523   0       2     152     0     4.7     2  1    3
5  36   1  2      119  131   0       2     128     0     5.9     3  1    3
6  49   1  1      186  571   0       0     176     0     4.0     3  0    3
  smoking diabetes  bmi heart_disease
1       1        0 23.4             1
2       0        1 35.4             0
3       1        1 29.9             0
4       1        0 15.2             1
5       1        0 16.7             1
6       1        0 33.8             0

# Display the structure of the data frame: column names, types and the
# first few values of each column.
str(dados_coracao)

'data.frame':   3069 obs. of  17 variables:
 $ age          : int  67 57 43 71 36 49 67 47 51 39 ...
 $ sex          : int  1 1 1 0 1 1 1 1 0 0 ...
 $ cp           : int  2 3 4 4 2 1 1 2 4 2 ...
 $ trestbps     : int  111 109 171 90 119 186 113 103 125 158 ...
 $ chol         : int  536 107 508 523 131 571 127 305 592 486 ...
 $ fbs          : int  0 0 0 0 0 0 1 0 1 0 ...
 $ restecg      : int  2 2 1 2 2 0 0 0 0 0 ...
 $ thalach      : int  88 119 113 152 128 176 68 185 136 69 ...
 $ exang        : int  0 0 0 0 0 0 0 0 0 0 ...
 $ oldpeak      : num  1.3 5.4 3.7 4.7 5.9 4 6.1 1.9 1.2 2.5 ...
 $ slope        : int  3 2 3 2 3 3 2 2 3 1 ...
 $ ca           : int  2 0 0 1 1 0 3 0 0 0 ...
 $ thal         : int  3 3 7 3 3 3 3 3 6 3 ...
 $ smoking      : int  1 0 1 1 1 1 1 1 0 0 ...
 $ diabetes     : int  0 1 1 0 0 0 0 0 0 0 ...
 $ bmi          : num  23.4 35.4 29.9 15.2 16.7 33.8 26.2 25.1 18.6 18.9 ...
 $ heart_disease: int  1 0 0 1 1 0 0 0 0 0 ...

Parquet

#O arquivo fortune1000_companies.parquet, contém informações sobre as 1000 empresas mais ricas, listadas pela Fortune. O formato Parquet é colunar, otimizado para performance e armazenamento eficiente, muito comum em ambientes de Big Data.

#Para ler arquivos Parquet, utiliza-se o pacote arrow. O código mostra a descompactação do arquivo .zip somente quando necessário. A função read_parquet() é responsável por ler o arquivo e carregá-lo como um data frame.

#Arquivo sobre as 1000 empresas com maiores fortunas

# Path of the Parquet file this section works with.
arquivo_parquet <- "fortune1000_companies.parquet"

# Extract the archive only when the Parquet file is not already on disk.
if (!file.exists(arquivo_parquet)) {
  # Not found: report it and extract "archive_fortune.zip".
  print("Arquivo não encontrado. Descompactando 'archive_fortune.zip'...")
  unzip("archive_fortune.zip")

  # Verify the extraction actually produced the expected file, so a missing
  # or incomplete archive fails here with a clear message.
  if (!file.exists(arquivo_parquet)) {
    stop(
      "Arquivo '", arquivo_parquet,
      "' não encontrado após descompactar 'archive_fortune.zip'.",
      call. = FALSE
    )
  }
} else {
  # Already present: just report that the extraction step is skipped.
  print("Arquivo já existe. Pulando a descompactação.")
}

[1] "Arquivo já existe. Pulando a descompactação."

# Load the Parquet file located at `arquivo_parquet` using the arrow package.
dados_fortune <- arrow::read_parquet(file = arquivo_parquet)

# Preview the first six rows of the loaded table.
head(dados_fortune, n = 6L)

# A tibble: 6 × 25
  Company     Revenue Percent Chan…¹ Profits Percent Chan…² `Assets ($M)` Sector
  <chr>       <chr>                  <chr>                  <chr>         <chr> 
1 Walmart     6%                     "32.8%"                $252,399      Retai…
2 Amazon      11.8%                  ""                     $527,854      Retai…
3 Apple       -2.8%                  "-2.8%"                $352,583      Techn…
4 UnitedHeal… 14.6%                  "11.2%"                $273,720      Healt…
5 Berkshire … 20.7%                  ""                     $1,069,978    Finan…
6 CVS Health  10.9%                  "101.1%"               $249,728      Healt…
# ℹ abbreviated names: ¹`Revenue Percent Change`, ²`Profits Percent Change`
# ℹ 20 more variables: `Headquarters City` <chr>, State <chr>,
#   `Newcomer to the Fortune 500` <chr>, Profitable <chr>,
#   `Founder is CEO` <chr>, `Female CEO` <chr>, `Growth in Jobs` <chr>,
#   `World's Most Admired Companies` <chr>, `Best Companies` <chr>,
#   `Global 500` <chr>, `Change in Rank (500 only)` <chr>,
#   `Dropped in Rank` <chr>, `Gained in Rank` <chr>, …

# Display the structure of the Fortune data: column names, types and the
# first few values of each column.
str(dados_fortune)

tibble [1,000 × 25] (S3: tbl_df/tbl/data.frame)
 $ Company                       : chr [1:1000] "Walmart" "Amazon" "Apple" "UnitedHealth Group" ...
 $ Revenue Percent Change        : chr [1:1000] "6%" "11.8%" "-2.8%" "14.6%" ...
 $ Profits Percent Change        : chr [1:1000] "32.8%" "" "-2.8%" "11.2%" ...
 $ Assets ($M)                   : chr [1:1000] "$252,399" "$527,854" "$352,583" "$273,720" ...
 $ Sector                        : chr [1:1000] "Retailing" "Retailing" "Technology" "Health Care" ...
 $ Headquarters City             : chr [1:1000] "Bentonville" "Seattle" "Cupertino" "Minnetonka" ...
 $ State                         : chr [1:1000] "Arkansas" "Washington" "California" "Minnesota" ...
 $ Newcomer to the Fortune 500   : chr [1:1000] "no" "no" "no" "no" ...
 $ Profitable                    : chr [1:1000] "yes" "yes" "yes" "yes" ...
 $ Founder is CEO                : chr [1:1000] "no" "no" "no" "no" ...
 $ Female CEO                    : chr [1:1000] "no" "no" "no" "no" ...
 $ Growth in Jobs                : chr [1:1000] "no" "no" "no" "yes" ...
 $ World's Most Admired Companies: chr [1:1000] "yes" "yes" "yes" "yes" ...
 $ Best Companies                : chr [1:1000] "yes" "no" "no" "no" ...
 $ Global 500                    : chr [1:1000] "yes" "yes" "yes" "yes" ...
 $ Change in Rank (500 only)     : chr [1:1000] "" "" "1" "1" ...
 $ Dropped in Rank               : chr [1:1000] "no" "no" "no" "no" ...
 $ Gained in Rank                : chr [1:1000] "no" "no" "yes" "yes" ...
 $ Change in Rank (Full 1000)    : chr [1:1000] "" "" "1" "1" ...
 $ Market Value ($M)             : chr [1:1000] "$484,852.8" "$1,873,675.8" "$2,647,973.8" "$456,080.8" ...
 $ Employees                     : chr [1:1000] "2,100,000" "1,525,000" "161,000" "440,000" ...
 $ Industry                      : chr [1:1000] "General Merchandisers" "Internet Services and Retailing" "Computers, Office Equipment" "Health Care: Insurance and Managed Care" ...
 $ Revenues ($M)                 : chr [1:1000] "$648,125" "$574,785" "$383,285" "$371,622" ...
 $ Profits ($M)                  : chr [1:1000] "$15,511" "$30,425" "$96,995" "$22,381" ...
 $ Rank                          : int [1:1000] 1 2 3 4 5 6 7 8 9 10 ...