使用Python从Google文档下载电子表格

Download a spreadsheet from Google Docs using Python

给定键和工作表ID(gid)，您能否提供一个Python示例，说明如何下载Google Docs电子表格？我自己做不到。

我已经仔细研究了API的版本1、2和3，但没有任何进展：我无法弄清楚他们那套复杂的、类似ATOM的供稿(feed)API，gdata.docs.service.DocsService._DownloadFile私有方法提示我未经授权，而且我不想自己编写整个Google Login身份验证系统。我已经沮丧到想捂脸了。

我有一些电子表格，我想像这样访问它们：

1
2
3
4
5
6
7
8
9

username = '[email protected]'
password = getpass.getpass()

def get_spreadsheet(key, gid=0):
... (help!) ...

for row in get_spreadsheet('5a3c7f7dcee4b4f'):
cell1, cell2, cell3 = row
...

请帮我挽回颜面。

更新1：我尝试了以下操作，但是Download()或Export()的组合似乎无效。 (此处是DocsService的文档)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31

import gdata.docs.service
import getpass
import os
import tempfile
import csv

def get_csv(file_path):
    """Parse the CSV file at ``file_path`` and return a ``csv.reader`` over its rows.

    The file is read completely and closed before returning, so the caller
    may delete it while still iterating over the returned reader.
    """
    # ``file()`` only exists in Python 2 and the handle was never closed;
    # ``open`` inside ``with`` works on both versions and releases the handle.
    with open(file_path) as csv_file:
        lines = csv_file.readlines()
    return csv.reader(lines)

def get_spreadsheet(key, gid=0):
    """Download the document ``key`` to a temporary CSV file and return its rows.

    Logs in with ``ProgrammaticLogin``, fetches the document list entry for
    ``key``, downloads it to a temporary file, parses that file with
    ``get_csv`` and always removes the temporary file afterwards.

    ``gid`` is part of the intended interface but is not used by this body.
    """
    gd_client = gdata.docs.service.DocsService()
    gd_client.email = '[email protected]'
    gd_client.password = getpass.getpass()
    gd_client.ssl = False
    gd_client.source = "My Fancy Spreadsheet Downloader"
    gd_client.ProgrammaticLogin()

    # mkstemp creates the file atomically; mktemp only returned a name, which
    # another process could claim before the download wrote to it.
    fd, file_path = tempfile.mkstemp(suffix='.csv')
    os.close(fd)
    uri = 'http://docs.google.com/feeds/documents/private/full/%s' % key
    try:
        entry = gd_client.GetDocumentListEntry(uri)

        # XXXX - The following dies with RequestError "Unauthorized"
        gd_client.Download(entry, file_path)

        return get_csv(file_path)
    finally:
        # Best-effort cleanup: the file may already be gone.
        try:
            os.remove(file_path)
        except OSError:
            pass

相关讨论

与那些建议使用gdata库的旧答案相比，https://github.com/burnash/gspread 库是与Google Spreadsheets进行交互的一种更新、更简单的方法；gdata库不仅级别太低，而且也过于复杂。

您还需要创建并下载(以JSON格式)服务帐户密钥：https://console.developers.google.com/apis/credentials/serviceaccountkey

以下是使用方法的示例：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16

import csv
import gspread
from oauth2client.service_account import ServiceAccountCredentials

# Authorize with the service-account key downloaded as credentials.json.
scope = ['https://spreadsheets.google.com/feeds']
credentials = ServiceAccountCredentials.from_json_keyfile_name('credentials.json', scope)

docid = "0zjVQXjJixf-SdGpLKnJtcmQhNjVUTk1hNTRpc0x5b9c"

client = gspread.authorize(credentials)
spreadsheet = client.open_by_key(docid)

# Write every worksheet to its own file: <docid>-worksheet<index>.csv
for i, worksheet in enumerate(spreadsheet.worksheets()):
    filename = docid + '-worksheet' + str(i) + '.csv'
    # On Python 3 csv.writer needs a text-mode file opened with newline='';
    # the previous 'wb' mode raises TypeError when rows are written.
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerows(worksheet.get_all_values())

相关讨论

如果有人遇到这种情况寻求快速解决方案，这是另一个(当前)不依赖gdata客户端库的解决方案：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57

#!/usr/bin/python

import re, urllib, urllib2

class Spreadsheet(object):
    """Handle for a spreadsheet, identified only by its document key."""

    def __init__(self, key):
        super(Spreadsheet, self).__init__()
        # Key that is substituted into the export URL by the client.
        self.key = key

class Client(object):
    """Small spreadsheet download client that authenticates with ClientLogin.

    NOTE: uses the Python 2 modules ``urllib2`` and ``urllib.urlencode``.
    """

    def __init__(self, email, password):
        super(Client, self).__init__()
        self.email = email
        self.password = password

    def _get_auth_token(self, email, password, source, service):
        """POST the credentials to ClientLogin and return the ``Auth`` token.

        Raises ``IndexError`` if the response contains no ``Auth=`` entry.
        """
        url = "https://www.google.com/accounts/ClientLogin"
        params = {
            "Email": email, "Passwd": password,
            "service": service,
            "accountType": "HOSTED_OR_GOOGLE",
            "source": source
        }
        req = urllib2.Request(url, urllib.urlencode(params))
        return re.findall(r"Auth=(.*)", urllib2.urlopen(req).read())[0]

    def get_auth_token(self):
        """Return a token for this client's credentials.

        The class name is sent as the ``source`` value; "wise" is presumably
        the ClientLogin service name for spreadsheets -- TODO confirm.
        """
        source = type(self).__name__
        return self._get_auth_token(self.email, self.password, source, service="wise")

    def download(self, spreadsheet, gid=0, format="csv"):
        """Request worksheet ``gid`` of ``spreadsheet`` in the given ``format``.

        Returns the file-like response object from ``urllib2.urlopen``.
        A fresh auth token is fetched on every call.
        """
        url_format = "https://spreadsheets.google.com/feeds/download/spreadsheets/Export?key=%s&exportFormat=%s&gid=%i"
        headers = {
            "Authorization": "GoogleLogin auth=" + self.get_auth_token(),
            "GData-Version": "3.0"
        }
        req = urllib2.Request(url_format % (spreadsheet.key, format, gid), headers=headers)
        return urllib2.urlopen(req)

if __name__ == "__main__":
    import getpass
    import csv

    email = "" # (your email here)
    password = getpass.getpass()
    spreadsheet_id = "" # (spreadsheet id here)

    # Create client and spreadsheet objects
    gs = Client(email, password)
    ss = Spreadsheet(spreadsheet_id)

    # Request a file-like object containing the spreadsheet's contents
    csv_file = gs.download(ss)

    # Parse as CSV and print the rows
    for row in csv.reader(csv_file):
        # Parenthesized form prints the same on Python 2 and is valid on Python 3.
        print(",".join(row))

相关讨论

您可以尝试使用文档的"导出电子表格"部分中介绍的AuthSub方法。

为电子表格服务获取一个单独的登录令牌，并将其替换为导出。将其添加到get_spreadsheet代码对我有用：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16

import gdata.spreadsheet.service

def get_spreadsheet(key, gid=0):
    # Fragment: the elided parts ("# ...") are the unchanged code that creates
    # gd_client, uri and file_path.
    # ...
    # Log in to the Spreadsheets service separately to get its own token.
    spreadsheets_client = gdata.spreadsheet.service.SpreadsheetsService()
    spreadsheets_client.email = gd_client.email
    spreadsheets_client.password = gd_client.password
    spreadsheets_client.source = "My Fancy Spreadsheet Downloader"
    spreadsheets_client.ProgrammaticLogin()

    # ...
    entry = gd_client.GetDocumentListEntry(uri)
    docs_auth_token = gd_client.GetClientLoginToken()
    # Swap in the Spreadsheets token for the duration of the export.
    gd_client.SetClientLoginToken(spreadsheets_client.GetClientLoginToken())
    try:
        gd_client.Export(entry, file_path)
    finally:
        # Restore even when Export raises, so later DocList calls still work.
        gd_client.SetClientLoginToken(docs_auth_token) # reset the DocList auth token

注意，我还使用了Export，因为Download似乎只提供PDF文件。

相关讨论

(2016年7月)用当前术语表述："如何使用Python从Google云端硬盘下载CSV或XLSX格式的Google表格？"。 (Google文档现在仅指基于云的文字处理器/文本编辑器，它不提供对Google表格电子表格的访问。)

首先，所有其他答案都已经过时或将会过时，因为它们使用了旧的GData(" Google数据")协议，ClientLogin或AuthSub，而所有这些都已被弃用。对于使用Google Sheets API v3或更早版本的所有代码或库，情况也是如此。

现代的Google API访问使用API密钥(公共数据)或OAuth2授权(授权数据)进行，主要是通过Google API客户端库(包括用于Python的库)进行。 (不，您不必为了访问API而构建一个完整的身份验证系统...请参阅下面的博客文章。)

要执行OP中/由OP所请求的任务，您需要获得对Google Drive API的授权访问权，也许需要查询要下载的特定表格，然后执行实际的导出操作。由于这可能是常见的操作，因此我写了一篇博客文章，分享了一个为您完成此操作的代码段。如果您想进一步追求这一点，我还有另外两对帖子以及一个视频，概述了如何将文件上传到Google云端硬盘和从Google云端硬盘下载文件。

请注意，还有一个更新的Google Sheets API v4，但是它主要用于面向电子表格的操作，即插入数据、读取电子表格行、单元格格式、创建图表、添加数据透视表等，而不是基于文件的请求(例如导出)；对于后者，Drive API才是应该使用的API。

我写了一篇博客文章，演示了如何将Google表格从云端硬盘导出为CSV格式。脚本的核心部分：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19

# setup
FILENAME = 'inventory'
SRC_MIMETYPE = 'application/vnd.google-apps.spreadsheet'
DST_MIMETYPE = 'text/csv'
DRIVE = discovery.build('drive', 'v3', http=creds.authorize(Http()))

# query for file to export: most recently modified first, then by name
files = DRIVE.files().list(
    q='name="%s" and mimeType="%s"' % (FILENAME, SRC_MIMETYPE),
    orderBy='modifiedTime desc,name').execute().get('files', [])

# export 1st match (if found)
if files:
    # Local file name: sheet name with spaces replaced, extension forced to .csv
    fn = '%s.csv' % os.path.splitext(files[0]['name'].replace(' ', '_'))[0]
    print('Exporting "%s" as "%s"... ' % (files[0]['name'], fn), end='')
    data = DRIVE.files().export(fileId=files[0]['id'], mimeType=DST_MIMETYPE).execute()
    if data:
        # The export payload is written as raw bytes.
        with open(fn, 'wb') as f:
            f.write(data)
        print('DONE')

要了解有关将Google表格与Python结合使用的更多信息，请参见我对类似问题的回答。您还可以下载XLSX和云端硬盘支持的其他格式的工作表。

如果您是Google API的新手，则需要进一步退后一步并首先查看以下视频：

如何使用Google API和创建API项目-用户界面已更改，但概念仍然相同
授权样板代码演练(Python)-您可以使用任何受支持的语言来访问Google API；如果您不使用Python，请使用它作为伪代码来帮助您入门
在Google云端硬盘中列出您的文件并进行代码深入探讨

如果您已经有使用G Suite API的经验，并且想观看有关同时使用这两种API的更多视频，请：

Sheets API视频库
云端硬盘API视频库

从gdata 2.0.1.4开始，此功能不再起作用：

1	gd_client.SetClientLoginToken(spreadsheets_client.GetClientLoginToken())

相反，您必须执行以下操作：

1	gd_client.SetClientLoginToken(gdata.gauth.ClientLoginToken(spreadsheets_client.GetClientLoginToken()))

相关讨论

(2016年12月)尝试我写的另一个库：pygsheets。它类似于gspread，但使用的是Google API v4。它提供了export方法来导出电子表格。

1
2
3
4
5
6
7
8
9
10

import pygsheets

gc = pygsheets.authorize()

# Open the spreadsheet by title, then take its first worksheet
sh = gc.open('my new ssheet')
wks = sh.sheet1

# Export the worksheet as CSV
wks.export(pygsheets.ExportType.CSV)

以下代码适用于我的情况(Ubuntu 10.4，python 2.6.5 gdata 2.0.14)

1
2
3
4
5
6
7
8
9
10
11
12

import gdata.docs.service
import gdata.spreadsheet.service
# Log in to both services; email and password are defined by the caller.
gd_client = gdata.docs.service.DocsService()
gd_client.ClientLogin(email,password)
spreadsheets_client = gdata.spreadsheet.service.SpreadsheetsService()
spreadsheets_client.ClientLogin(email,password)
# ... (elided: code that defines entry and file_path)
file_path = file_path.strip()+".xls"
# Remember the Docs token so it can be restored after the export.
docs_token = gd_client.auth_token
# Export using the Spreadsheets service token.
gd_client.SetClientLoginToken(spreadsheets_client.GetClientLoginToken())
gd_client.Export(entry, file_path)
# Put the original Docs token back.
gd_client.auth_token = docs_token

使用gsheets库从Google文档下载电子表格非常简单。

您可以按照以下详细文档

https://pypi.org/project/gsheets/

或按照以下步骤操作。我建议您通读文档以获得更好的覆盖范围。

pip install gsheets

使用您要访问其电子表格的Google帐户登录到Google Developers Console。创建(或选择)一个项目并启用Drive API和Sheets API(在Google Apps API下)。

转到项目的凭据，然后创建其他类型的新凭据> OAuth客户端ID>。在OAuth 2.0客户端ID的列表中，单击刚刚创建的客户端ID的下载JSON。将文件另存为主目录(用户目录)中的client_secrets.json。

使用以下代码段。

1
2
3
4
5
6
7
8

from gsheets import Sheets
# Use the OAuth client file saved as client_secrets.json in the home directory;
# the previous 'client_secret.json' named a different file in the working directory.
sheets = Sheets.from_files('~/client_secrets.json', '~/storage.json')
print(sheets) # will ensure authenticate connection

s = sheets.get("{SPREADSHEET_URL}")
print(s) # will ensure your file is accessible

# Index 1 selects the second worksheet of the spreadsheet.
s.sheets[1].to_csv('Spam.csv', encoding='utf-8', dialect='excel') # will download the file as csv

我正在使用这个：
对设置为公开可读的工作表执行 curl "https://docs.google.com/spreadsheets/d/1-lqLuYJyHAKix-T8NR8wV8ZUUbVOJrZTysccid2-ycs/gviz/tq?tqx=out:csv"。

因此，如果可以使用公共工作表，则需要使用curl的python版本。

如果您的工作表带有不希望显示的选项卡，请创建一个新工作表，然后将要发布的范围导入到该工作表的选项卡中。

相关讨论

通过消除不必要的面向对象，我进一步简化了@Cameron的答案。这使代码更小，更易于理解。我还编辑了URL，这可能会更好。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40

#!/usr/bin/python
import re, urllib, urllib2

def get_auth_token(email, password):
    """POST the credentials to ClientLogin and return the ``Auth`` token.

    Raises ``IndexError`` if the response contains no ``Auth=`` entry.
    NOTE: uses the Python 2 modules ``urllib2`` and ``urllib.urlencode``.
    """
    url = "https://www.google.com/accounts/ClientLogin"
    params = {
        "Email": email, "Passwd": password,
        "service": 'wise',
        "accountType": "HOSTED_OR_GOOGLE",
        "source": 'Client'
    }
    req = urllib2.Request(url, urllib.urlencode(params))
    return re.findall(r"Auth=(.*)", urllib2.urlopen(req).read())[0]

def download(spreadsheet, worksheet, email, password, format="csv"):
    """Request worksheet ``worksheet`` (gid) of ``spreadsheet`` in ``format``.

    Returns the file-like response object from ``urllib2.urlopen``.
    """
    # gid must be a query parameter: everything after '#' is a URL fragment
    # that is never sent to the server, so '#gid=' could not select a worksheet.
    url_format = 'https://docs.google.com/spreadsheets/d/%s/export?exportFormat=%s&gid=%s'

    headers = {
        "Authorization": "GoogleLogin auth=" + get_auth_token(email, password),
        "GData-Version": "3.0"
    }
    req = urllib2.Request(url_format % (spreadsheet, format, worksheet), headers=headers)
    return urllib2.urlopen(req)

if __name__ == "__main__":
    import getpass
    import csv

    spreadsheet_id = "" # (spreadsheet id here)
    worksheet_id = '' # (gid here)
    email = "" # (your email here)
    password = getpass.getpass()

    # Request a file-like object containing the spreadsheet's contents
    csv_file = download(spreadsheet_id, worksheet_id, email, password)

    # Parse as CSV and print the rows
    for row in csv.reader(csv_file):
        # Parenthesized form prints the same on Python 2 and is valid on Python 3.
        print(",".join(row))

相关讨论

(2019年3月，Python 3)我的数据通常不敏感，我通常使用类似于CSV的表格式。

在这种情况下，可以简单地publish to the web该工作表，然后将其用作服务器上的CSV文件。

(一个使用File-> Publish to the web ...-> Sheet 1-> Comma separated values (.csv)-> Publish发布)。

1
2
3
4
5
6
7
8
9
10
11

import csv
import io
import requests

url = "https://docs.google.com/spreadsheets/d/e/<GOOGLE_ID>/pub?gid=0&single=true&output=csv" # you can get the whole link in the 'Publish to the web' dialog
r = requests.get(url)
# Decode the body as UTF-8 rather than the encoding requests guesses.
r.encoding = 'utf-8'
# newline="" leaves line endings untouched so the csv module handles them itself.
csvio = io.StringIO(r.text, newline="")
# One dict per data row, keyed by the header row.
data = list(csv.DictReader(csvio))

Gspread确实比GoogleCL和Gdata有了很大的改进(这两个我都用过，值得庆幸的是已逐步用Gspread取代了它们)。我认为这段代码比以前的答案更快地得到工作表的内容：

1
2
3
4
5
6
7
8
9
10
11

# Placeholder credentials -- replace with your own, and avoid hard-coding a
# real password in source.
username = '[email protected]'
password = 'sdfsdfsadfsdw'
sheetname = "Sheety Sheet"

client = gspread.login(username, password)
spreadsheet = client.open(sheetname)

# Copy every row of the first worksheet into a new list.
worksheet = spreadsheet.sheet1
contents = list(worksheet.get_all_values())

这不是一个完整的答案，但是Andreas Kahler使用Google Docs + Google App Engine + Python编写了一个有趣的CMS解决方案。我没有该领域的经验，无法确切看出代码的哪一部分可能对您有用，但请查看一下。我知道它可以连接Google Docs帐户并处理文件，所以我觉得您能看懂其中的做法。它至少应该为您指明正确的方向。

Google AppEngine + Google文档+一些Python =简单CMS