!apt install python3-tk ghostscript
!pip install camelot-py[cv]
#Download PDF
!wget https://www.mhlw.go.jp/content/11200000/000541154.pdf -O data.pdf
!pip install japanmap
!pip install japanize-matplotlib
import re
import camelot
import pandas as pd
tables = camelot.read_pdf(
"data.pdf", pages="all", split_text=True, strip_text=" \n", line_scale=40
)
dfs = []
for table in tables:
df0 = table.df
df_col = df0.head(2)
#Complement blanks at cell merges
idx = pd.MultiIndex.from_arrays(
df_col.mask(df_col == "").fillna(method="ffill", axis=1).values
)
df1 = df0.iloc[2:].set_axis(idx, axis=1)
df1.set_index(("year", "Name of prefectures"), inplace=True)
df1.index.name = "Name of prefectures"
#Convert from row to column
df2 = df1.unstack().reset_index().set_axis(["year", "Classification", "Prefectures", "Contents"], axis=1)
#Reconstruction by year / prefecture
df3 = df2.pivot(index=["year", "Prefectures"], columns="Classification", values="Contents")
dfs.append(df3)
df = pd.concat(dfs).reset_index()
df.to_csv("data.tsv", sep="\t")
I was able to copy a merged cell using "copy_text" from camelot h: horizontal v: vertical https://camelot-py.readthedocs.io/en/master/user/advanced.html#copy-text-in-spanning-cells
tables = camelot.read_pdf(
"data.pdf",
pages="all",
split_text=True,
strip_text=" \n",
line_scale=40,
copy_text=["h"],
)
dfs = []
for table in tables:
data = table.data
df1 = pd.DataFrame(data[2:], columns=data[:2])
df1.set_index(("year", "Name of prefectures"), inplace=True)
df1.index.name = "Name of prefectures"
df2 = df1.unstack().reset_index().set_axis(["year", "Classification", "Prefectures", "Contents"], axis=1)
df3 = df2.pivot(index=["year", "Prefectures"], columns="Classification", values="Contents")
dfs.append(df3)
df = pd.concat(dfs).reset_index()
df.to_csv("data.tsv", sep="\t")
def era2year(s):
m = re.search("(Heisei|Reiwa)([0-9 yuan]{1,2})year", s)
if m:
era = m.group(1)
nen = m.group(2)
year = 1 if nen == "Former" else int(nen)
if era == "Heisei":
year += 1988
elif era == "Reiwa":
year += 2018
return year
return None
from japanmap import pref_code
#Give the prefecture code from the prefecture name
df["Prefecture code"] = df["Prefectures"].apply(pref_code)
#Convert from era to year
df["Year"] = df["year"].apply(era2year)
#Remove commas and convert to numbers
df["Revised amount(Circle)"] = df["Revised amount(Circle)"].str.replace(",", "").astype(int)
#Sort by prefecture
pv = df.pivot(index=["Prefecture code", "Prefectures"], columns="Year", values="Revised amount(Circle)").droplevel(
"Prefecture code"
)
pv
#Scraping
df = pd.read_html(
"https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/koyou_roudou/roudoukijun/minimumichiran/",
index_col=0,
header=0,
)[0]
df.index = df.index.str.replace("\s", "")
df.rename(columns={"Minimum wage hour [yen]": 2020}, inplace=True)
#Combine with PDF list
pv1 = pd.concat([pv, df[2020]], axis=1)
pv1.to_csv("minimumichiran.csv")
2002 | 2003 | 2004 | 2005 | 2006 | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
National weighted average | 663 | 664 | 665 | 668 | 673 | 687 | 703 | 713 | 730 | 737 | 749 | 764 | 780 | 798 | 823 | 848 | 874 | 901 | 902 |
Hokkaido | 637 | 637 | 638 | 641 | 644 | 654 | 667 | 678 | 691 | 705 | 719 | 734 | 748 | 764 | 786 | 810 | 835 | 861 | 861 |
Aomori | 605 | 605 | 606 | 608 | 610 | 619 | 630 | 633 | 645 | 647 | 654 | 665 | 679 | 695 | 716 | 738 | 762 | 790 | 793 |
Iwate | 605 | 605 | 606 | 608 | 610 | 619 | 628 | 631 | 644 | 645 | 653 | 665 | 678 | 695 | 716 | 738 | 762 | 790 | 793 |
Miyagi | 617 | 617 | 619 | 623 | 628 | 639 | 653 | 662 | 674 | 675 | 685 | 696 | 710 | 726 | 748 | 772 | 798 | 824 | 825 |
Akita | 605 | 605 | 606 | 608 | 610 | 618 | 629 | 632 | 645 | 647 | 654 | 665 | 679 | 695 | 716 | 738 | 762 | 790 | 792 |
Yamagata | 605 | 606 | 607 | 610 | 613 | 620 | 629 | 631 | 645 | 647 | 654 | 665 | 680 | 696 | 717 | 739 | 763 | 790 | 793 |
Fukushima | 610 | 610 | 611 | 614 | 618 | 629 | 641 | 644 | 657 | 658 | 664 | 675 | 689 | 705 | 726 | 748 | 772 | 798 | 800 |
Ibaraki | 647 | 647 | 648 | 651 | 655 | 665 | 676 | 678 | 690 | 692 | 699 | 713 | 729 | 747 | 771 | 796 | 822 | 849 | 851 |
Tochigi | 648 | 648 | 649 | 652 | 657 | 671 | 683 | 685 | 697 | 700 | 705 | 718 | 733 | 751 | 775 | 800 | 826 | 853 | 854 |
Gunma | 644 | 644 | 645 | 649 | 654 | 664 | 675 | 676 | 688 | 690 | 696 | 707 | 721 | 737 | 759 | 783 | 809 | 835 | 837 |
Saitama | 678 | 678 | 679 | 682 | 687 | 702 | 722 | 735 | 750 | 759 | 771 | 785 | 802 | 820 | 845 | 871 | 898 | 926 | 928 |
Chiba | 677 | 677 | 678 | 682 | 687 | 706 | 723 | 728 | 744 | 748 | 756 | 777 | 798 | 817 | 842 | 868 | 895 | 923 | 925 |
Tokyo | 708 | 708 | 710 | 714 | 719 | 739 | 766 | 791 | 821 | 837 | 850 | 869 | 888 | 907 | 932 | 958 | 985 | 1013 | 1013 |
Kanagawa | 706 | 707 | 708 | 712 | 717 | 736 | 766 | 789 | 818 | 836 | 849 | 868 | 887 | 905 | 930 | 956 | 983 | 1011 | 1012 |
Niigata | 641 | 641 | 642 | 645 | 648 | 657 | 669 | 669 | 681 | 683 | 689 | 701 | 715 | 731 | 753 | 778 | 803 | 830 | 831 |
Toyama | 644 | 644 | 644 | 648 | 652 | 666 | 677 | 679 | 691 | 692 | 700 | 712 | 728 | 746 | 770 | 795 | 821 | 848 | 849 |
Ishikawa | 645 | 645 | 646 | 649 | 652 | 662 | 673 | 674 | 686 | 687 | 693 | 704 | 718 | 735 | 757 | 781 | 806 | 832 | 833 |
Fukui | 642 | 642 | 643 | 645 | 649 | 659 | 670 | 671 | 683 | 684 | 690 | 701 | 716 | 732 | 754 | 778 | 803 | 829 | 830 |
Yamanashi | 647 | 647 | 648 | 651 | 655 | 665 | 676 | 677 | 689 | 690 | 695 | 706 | 721 | 737 | 759 | 784 | 810 | 837 | 838 |
Nagano | 646 | 646 | 647 | 650 | 655 | 669 | 680 | 681 | 693 | 694 | 700 | 713 | 728 | 746 | 770 | 795 | 821 | 848 | 849 |
Gifu | 668 | 668 | 669 | 671 | 675 | 685 | 696 | 696 | 706 | 707 | 713 | 724 | 738 | 754 | 776 | 800 | 825 | 851 | 852 |
Shizuoka | 671 | 671 | 673 | 677 | 682 | 697 | 711 | 713 | 725 | 728 | 735 | 749 | 765 | 783 | 807 | 832 | 858 | 885 | 885 |
Aichi | 681 | 681 | 683 | 688 | 694 | 714 | 731 | 732 | 745 | 750 | 758 | 780 | 800 | 820 | 845 | 871 | 898 | 926 | 927 |
Triple | 667 | 667 | 668 | 671 | 675 | 689 | 701 | 702 | 714 | 717 | 724 | 737 | 753 | 771 | 795 | 820 | 846 | 873 | 874 |
Shiga | 651 | 651 | 652 | 657 | 662 | 677 | 691 | 693 | 706 | 709 | 716 | 730 | 746 | 764 | 788 | 813 | 839 | 866 | 868 |
Kyoto | 677 | 677 | 678 | 682 | 686 | 700 | 717 | 729 | 749 | 751 | 759 | 773 | 789 | 807 | 831 | 856 | 882 | 909 | 909 |
Osaka | 703 | 703 | 704 | 708 | 712 | 731 | 748 | 762 | 779 | 786 | 800 | 819 | 838 | 858 | 883 | 909 | 936 | 964 | 964 |
Hyogo | 675 | 675 | 676 | 679 | 683 | 697 | 712 | 721 | 734 | 739 | 749 | 761 | 776 | 794 | 819 | 844 | 871 | 899 | 900 |
Nara | 647 | 647 | 648 | 652 | 656 | 667 | 678 | 679 | 691 | 693 | 699 | 710 | 724 | 740 | 762 | 786 | 811 | 837 | 838 |
Wakayama | 645 | 645 | 645 | 649 | 652 | 662 | 673 | 674 | 684 | 685 | 690 | 701 | 715 | 731 | 753 | 777 | 803 | 830 | 831 |
Tottori | 610 | 610 | 611 | 612 | 614 | 621 | 629 | 630 | 642 | 646 | 653 | 664 | 677 | 693 | 715 | 738 | 762 | 790 | 792 |
Shimane | 609 | 609 | 610 | 612 | 614 | 621 | 629 | 630 | 642 | 646 | 652 | 664 | 679 | 696 | 718 | 740 | 764 | 790 | 792 |
Okayama | 640 | 640 | 641 | 644 | 648 | 658 | 669 | 670 | 683 | 685 | 691 | 703 | 719 | 735 | 757 | 781 | 807 | 833 | 834 |
Hiroshima | 644 | 644 | 645 | 649 | 654 | 669 | 683 | 692 | 704 | 710 | 719 | 733 | 750 | 769 | 793 | 818 | 844 | 871 | 871 |
Yamaguchi | 637 | 637 | 638 | 642 | 646 | 657 | 668 | 669 | 681 | 684 | 690 | 701 | 715 | 731 | 753 | 777 | 802 | 829 | 829 |
Tokushima | 611 | 611 | 612 | 615 | 617 | 625 | 632 | 633 | 645 | 647 | 654 | 666 | 679 | 695 | 716 | 740 | 766 | 793 | 796 |
Kagawa | 618 | 619 | 620 | 625 | 629 | 640 | 651 | 652 | 664 | 667 | 674 | 686 | 702 | 719 | 742 | 766 | 792 | 818 | 820 |
Ehime | 611 | 611 | 612 | 614 | 616 | 623 | 631 | 632 | 644 | 647 | 654 | 666 | 680 | 696 | 717 | 739 | 764 | 790 | 793 |
Kochi | 611 | 611 | 611 | 613 | 615 | 622 | 630 | 631 | 642 | 645 | 652 | 664 | 677 | 693 | 715 | 737 | 762 | 790 | 792 |
Fukuoka | 643 | 644 | 645 | 648 | 652 | 663 | 675 | 680 | 692 | 695 | 701 | 712 | 727 | 743 | 765 | 789 | 814 | 841 | 842 |
Saga | 605 | 605 | 606 | 608 | 611 | 619 | 628 | 629 | 642 | 646 | 653 | 664 | 678 | 694 | 715 | 737 | 762 | 790 | 792 |
Nagasaki | 605 | 605 | 606 | 608 | 611 | 619 | 628 | 629 | 642 | 646 | 653 | 664 | 677 | 694 | 715 | 737 | 762 | 790 | 793 |
Kumamoto | 606 | 606 | 607 | 609 | 612 | 620 | 628 | 630 | 643 | 647 | 653 | 664 | 677 | 694 | 715 | 737 | 762 | 790 | 793 |
Oita | 606 | 606 | 607 | 610 | 613 | 620 | 630 | 631 | 643 | 647 | 653 | 664 | 677 | 694 | 715 | 737 | 762 | 790 | 792 |
Miyazaki | 605 | 605 | 606 | 608 | 611 | 619 | 627 | 629 | 642 | 646 | 653 | 664 | 677 | 693 | 714 | 737 | 762 | 790 | 793 |
Kagoshima | 605 | 605 | 606 | 608 | 611 | 619 | 627 | 630 | 642 | 647 | 654 | 665 | 678 | 694 | 715 | 737 | 761 | 790 | 793 |
Okinawa | 604 | 605 | 606 | 608 | 610 | 618 | 627 | 629 | 642 | 645 | 653 | 664 | 677 | 693 | 714 | 737 | 762 | 790 | 792 |
import japanize_matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt
#resolution
mpl.rcParams["figure.dpi"] = 200
pv1.T.plot(subplots=True, layout=(7, 7), figsize=(30, 30), ylim=(600, 1100))
#Save graph
plt.savefig("minimumichiran.png ", dpi=200, bbox_inches="tight")
plt.show()