-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathGetTableDataUsingPowerQuery.vba
More file actions
151 lines (124 loc) · 5.51 KB
/
GetTableDataUsingPowerQuery.vba
File metadata and controls
151 lines (124 loc) · 5.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
Function FetchDataFromPDFs(pdfPaths As Variant) As Scripting.Dictionary
Dim wb As Workbook, ws As Worksheet, currWB As Workbook
Dim dict As Scripting.Dictionary ' Add from references to use a dict (Optional)
Set currWB = ThisWorkbook
' Create a new workbook to temporarily store extracted table's data
Set wb = Workbooks.Add
Set ws = wb.Worksheets("Sheet1")
ws.Visible = False
Set dict = New Scripting.Dictionary
Dim pdfPath As Variant, sampleName As String, fileName As String
For Each pdfPath In pdfPaths
fileName = Right(pdfPath, Len(pdfPath) - InStrRev(pdfPath, "\"))
sampleName = Split(fileName)(0)
Set dict(sampleName) = New Scripting.Dictionary
Set dict(sampleName) = LoadDataTables(pdfPath) ' This function Fetches data from pdf
Next pdfPath
wb.Close False
currWB.Activate
Set FetchDataFromPDFs = dict
End Function
Function LoadDataTables(filePath As Variant) As Scripting.Dictionary
Dim Name As String, TableIds As Variant, TableId As String, fileName As String, idx As Long
Dim dict As Scripting.Dictionary
Debug.Print filePath
Set dict = New Scripting.Dictionary ' Store data from pdf into this dict
Name = "Table Query"
TableIds = GetPDFTablesIdList(filePath) ' This function fetches id's of all the tables present in the pdf.
For idx = 0 To UBound(TableIds) - LBound(TableIds) - 1 ' Iterate over each table.
TableId = TableIds(idx)
fileName = Right(filePath, Len(filePath) - InStrRev(filePath, "\"))
' M script code (Power Query)
ActiveWorkbook.Queries.Add Name:=Name, Formula:= _
"let" & _
" Source = Pdf.Tables(File.Contents(""" & filePath & """), [Implementation=""1.3""])," & _
" " & TableId & " = Source{[Id=""" & TableId & """]}[Data]," & _
" #""Promoted Headers"" = Table.PromoteHeaders(" & TableId & ", [PromoteAllScalars=true])," & _
" #""Changed Type"" = Table.TransformColumnTypes(" & _
"#""Promoted Headers"", List.Transform(Table.ColumnNames(#""Promoted Headers""), each {_, type text})" & _
")" & _
"in" & _
" #""Changed Type"""
With ActiveSheet.ListObjects.Add(SourceType:=0, Source:= _
"OLEDB;Provider=Microsoft.Mashup.OleDb.1;Data Source=$Workbook$;Location=""" & Name & """;Extended Properties=""""" _
, Destination:=Range("$A$1")).QueryTable
.CommandType = xlCmdSql
.CommandText = Array("SELECT * FROM [" & Name & "]")
.RowNumbers = False
.FillAdjacentFormulas = False
.PreserveFormatting = True
.RefreshOnFileOpen = False
.BackgroundQuery = True
.RefreshStyle = xlInsertDeleteCells
.SavePassword = False
.SaveData = True
.AdjustColumnWidth = True
.RefreshPeriod = 0
.PreserveColumnInfo = True
.Refresh BackgroundQuery:=False
End With
''' Processing...
Dim tbl As ListObject
Set tbl = ActiveSheet.ListObjects(1) ' This varibale contains the nth table data
Dim i As Long
For i = 0 To tbl.ListRows.count
Dim j As Long
For j = 1 To tbl.ListColumns.count
' Cell value of ith row and jth col.
cellValue = tbl.DataBodyRange(i, j).Value
dict(CStr(i) & " " & CStr(j)) = cellValue
Next j
Next i
' Remove current query
Dim Qus As WorkbookQuery
For Each Qus In ActiveWorkbook.Queries
Qus.Delete
Next
ActiveSheet.Cells.Clear
Next idx
Set LoadDataTables = dict
End Function
Function GetPDFTablesIdList(filePath As Variant) As Variant
Dim fileName As String, Name As String
Name = "Dummy Name"
fileName = Right(filePath, Len(filePath) - InStrRev(filePath, "\"))
ActiveWorkbook.Queries.Add Name:=Name, Formula:= _
"let" & _
" Source = Pdf.Tables(File.Contents(""" & filePath & """), [Implementation=""1.3""])," & _
" #""Filtered Rows"" = Table.SelectRows(Source, each ([Kind] = ""Table""))," & _
" #""Removed Columns"" = Table.RemoveColumns(#""Filtered Rows"",{""Kind"", ""Data""})" & _
"in" & _
" #""Removed Columns"""
With ActiveSheet.ListObjects.Add(SourceType:=0, Source:= _
"OLEDB;Provider=Microsoft.Mashup.OleDb.1;Data Source=$Workbook$;Location=""" & Name & """;Extended Properties=""""" _
, Destination:=Range("$A$1")).QueryTable
.CommandType = xlCmdSql
.CommandText = Array("SELECT * FROM [" & Name & "]")
.RowNumbers = False
.FillAdjacentFormulas = False
.PreserveFormatting = True
.RefreshOnFileOpen = False
.BackgroundQuery = True
.RefreshStyle = xlInsertDeleteCells
.SavePassword = False
.SaveData = True
.AdjustColumnWidth = True
.RefreshPeriod = 0
.PreserveColumnInfo = True
.Refresh BackgroundQuery:=False
End With
Dim tbl As ListObject
Set tbl = ActiveSheet.ListObjects(1)
Dim TableIds() As Variant
ReDim TableIds(tbl.ListRows.count)
For i = 0 To tbl.ListRows.count
TableIds(i) = tbl.DataBodyRange(i + 1, 1).Value
Next i
' Remove current query
Dim Qus As WorkbookQuery
For Each Qus In ActiveWorkbook.Queries
Qus.Delete
Next
ActiveSheet.Cells.Clear
GetPDFTablesIdList = TableIds
End Function