Publications | Songyang Zhang

2024

The Amazon Nova family of models: Technical report and model card

Amazon Artificial General Intelligence

Amazon Technical Reports, 2024

PDF Website

2023

Possible Worlds VQA: Cross Modality Bias Reduction in Visual Question Answering Systems from a Causal View

Ali Vosoughi, Shijian Deng, Songyang Zhang, Yapeng Tian, Chenliang Xu, and Jiebo Luo

IEEE Transactions on Multimedia, 2023

PDF

arXiv

Latent-Shift: Latent Diffusion with Temporal Shift for Efficient Text-to-Video Generation

Jie An^*, Songyang Zhang^*, Harry Yang, Sonal Gupta, Jia-Bin Huang, Jiebo Luo, and Xi Yin

arXiv preprint arXiv:2304.08477, 2023

Bib PDF Website

@article{an2023latent,
  title = {{Latent-Shift}: Latent Diffusion with Temporal Shift for Efficient Text-to-Video Generation},
  author = {An, Jie and Zhang, Songyang and Yang, Harry and Gupta, Sonal and Huang, Jia-Bin and Luo, Jiebo and Yin, Xi},
  journal = {arXiv preprint arXiv:2304.08477},
  eprint = {2304.08477},
  archiveprefix = {arXiv},
  year = {2023}
}

2022

EMNLP

Learning a Grammar Inducer by Watching Millions of Instructional YouTube Videos

Songyang Zhang, Linfeng Song, Lifeng Jin, Haitao Mi, Kun Xu, Dong Yu, and Jiebo Luo

In Conference on Empirical Methods in Natural Language Processing, 2022

Bib PDF Code Slides

@inproceedings{zhang2022training,
  oral = {true},
  talk = {https://www.youtube.com/watch?v=7caDMC24oro},
  title = {Learning a Grammar Inducer by Watching Millions of Instructional {YouTube} Videos},
  author = {Zhang, Songyang and Song, Linfeng and Jin, Lifeng and Mi, Haitao and Xu, Kun and Yu, Dong and Luo, Jiebo},
  booktitle = {Conference on Empirical Methods in Natural Language Processing},
  year = {2022}
}

ICLR

Make-A-Video: Text-to-video Generation without Text-Video Data

Uriel Singer, Adam Polyak, Thomas Hayes, Xi Yin, Jie An, Songyang Zhang, Qiyuan Hu, Harry Yang, Oron Ashual, Oran Gafni, Devi Parikh, Sonal Gupta, and Yaniv Taigman

In International Conference on Learning Representations, 2022

Bib PDF Website

@inproceedings{singer2022make,
  news = {https://ai.facebook.com/blog/generative-ai-text-to-video/},
  title = {{Make-A-Video}: Text-to-video Generation without Text-Video Data},
  author = {Singer, Uriel and Polyak, Adam and Hayes, Thomas and Yin, Xi and An, Jie and Zhang, Songyang and Hu, Qiyuan and Yang, Harry and Ashual, Oron and Gafni, Oran and Parikh, Devi and Gupta, Sonal and Taigman, Yaniv},
  booktitle = {International Conference on Learning Representations},
  year = {2022}
}

BMVC

Rethinking the Evaluation of Unbiased Scene Graph Generation

Xingchen Li, Long Chen, Jian Shao, Shaoning Xiao, Songyang Zhang, and Jun Xiao

In British Machine Vision Conference, 2022

Bib PDF Code

@inproceedings{li2022rethinking,
  author    = {Li, Xingchen and Chen, Long and Shao, Jian and Xiao, Shaoning and Zhang, Songyang and Xiao, Jun},
  title     = {Rethinking the Evaluation of Unbiased Scene Graph Generation},
  booktitle = {British Machine Vision Conference},
  year      = {2022},
  oral      = {true},
}

ECCV

MUGEN: A Playground for Video-Audio-Text Multimodal Understanding and GENeration

Thomas Hayes^*, Songyang Zhang^*, Xi Yin, Guan Pang, Sasha Sheng, Harry Yang, Songwei Ge, Qiyuan Hu, and Devi Parikh

In European Conference on Computer Vision, 2022

Bib PDF Supp Code Slides Website

@inproceedings{hayes2022mugen,
  news = {https://ai.facebook.com/blog/introducing-mugen-a-new-dataset-for-multimodal-research/},
  talk = {https://youtu.be/it0r6Q9a1jY},
  title = {{MUGEN}: A Playground for Video-Audio-Text Multimodal Understanding and {GENeration}},
  author = {Hayes, Thomas and Zhang, Songyang and Yin, Xi and Pang, Guan and Sheng, Sasha and Yang, Harry and Ge, Songwei and Hu, Qiyuan and Parikh, Devi},
  booktitle = {European Conference on Computer Vision},
  year = {2022}
}

ECCV

Expanding Language-Image Pretrained Models for General Video Recognition

Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, and Haibin Ling

In European Conference on Computer Vision, 2022

Bib PDF Code

@inproceedings{ni2022expanding,
  author    = {Ni, Bolin and Peng, Houwen and Chen, Minghao and Zhang, Songyang and Meng, Gaofeng and Fu, Jianlong and Xiang, Shiming and Ling, Haibin},
  title     = {Expanding Language-Image Pretrained Models for General Video Recognition},
  booktitle = {European Conference on Computer Vision},
  year      = {2022},
  oral      = {true},
}

CVPR

The Devil is in the Labels: Noisy Label Correction for Robust Scene Graph Generation

Lin Li, Long Chen, Yifeng Huang, Zhimeng Zhang, Songyang Zhang, and Jun Xiao

In IEEE Conference on Computer Vision and Pattern Recognition, 2022

Bib PDF Code

@inproceedings{li2022devil,
  author    = {Li, Lin and Chen, Long and Huang, Yifeng and Zhang, Zhimeng and Zhang, Songyang and Xiao, Jun},
  title     = {The Devil is in the Labels: Noisy Label Correction for Robust Scene Graph Generation},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition},
  year      = {2022},
  oral      = {true},
  talk      = {https://www.youtube.com/watch?v=vMLPZB50Vtg},
}

2021

ACMMM

Instance-wise or Class-wise? A Tale of Neighbor Shapley for Concept-based Explanation

Jiahui Li, Kun Kuang, Lin Li, Long Chen, Songyang Zhang, Jian Shao, and Jun Xiao

In ACM International Conference on Multimedia, 2021

Bib PDF

@inproceedings{yang2021sat,
  title = {Instance-wise or Class-wise? A Tale of Neighbor {Shapley} for Concept-based Explanation},
  author = {Li, Jiahui and Kuang, Kun and Li, Lin and Chen, Long and Zhang, Songyang and Shao, Jian and Xiao, Jun},
  booktitle = {ACM International Conference on Multimedia},
  year = {2021}
}

ICCV

SAT: 2D Semantics Assisted Training for 3D Visual Grounding

Zhengyuan Yang, Songyang Zhang, Liwei Wang, and Jiebo Luo

In IEEE International Conference on Computer Vision, 2021

Bib PDF Code

@inproceedings{yang2021sau,
  oral = {true},
  title = {{SAT}: {2D} Semantics Assisted Training for {3D} Visual Grounding},
  author = {Yang, Zhengyuan and Zhang, Songyang and Wang, Liwei and Luo, Jiebo},
  booktitle = {IEEE International Conference on Computer Vision},
  year = {2021}
}

NAACL

Video-aided Unsupervised Grammar Induction

Songyang Zhang, Linfeng Song, Lifeng Jin, Kun Xu, Dong Yu, and Jiebo Luo

In Conference of the North American Chapter of the Association for Computational Linguistics, 2021

Awarded Bib PDF Code Poster Slides

Best Long Paper Award

@inproceedings{zhang2021video,
  author    = {Zhang, Songyang and Song, Linfeng and Jin, Lifeng and Xu, Kun and Yu, Dong and Luo, Jiebo},
  title     = {Video-aided Unsupervised Grammar Induction},
  booktitle = {Conference of the North American Chapter of the Association for Computational Linguistics},
  year      = {2021},
  talk      = {https://underline.io/lecture/19921-video-aided-unsupervised-grammar-induction},
  news      = {https://mp.weixin.qq.com/s/bzh7lbcEzfwzRsDmOA1GsQ},
}

AAAI

Boundary Proposal Network for Two-Stage Natural Language Video Localization

Shaoning Xiao, Long Chen, Songyang Zhang, Wei Ji, Jian Shao, Lu Ye, and Jun Xiao

In the AAAI Conference on Artificial Intelligence, 2021

Bib PDF

@inproceedings{xiao2021boundary,
  author    = {Xiao, Shaoning and Chen, Long and Zhang, Songyang and Ji, Wei and Shao, Jian and Ye, Lu and Xiao, Jun},
  title     = {Boundary Proposal Network for Two-Stage Natural Language Video Localization},
  booktitle = {the AAAI Conference on Artificial Intelligence},
  year      = {2021},
}

BigData

Mi YouTube es Su YouTube? Analyzing the Cultures using YouTube Thumbnails of Popular Videos

Songyang Zhang, Tolga Aktas, and Jiebo Luo

In IEEE Big Data, 2021

Bib PDF Slides

@inproceedings{zhang2021mi,
  title = {Mi {YouTube} es Su {YouTube}? Analyzing the Cultures using {YouTube} Thumbnails of Popular Videos},
  author = {Zhang, Songyang and Aktas, Tolga and Luo, Jiebo},
  booktitle = {IEEE Big Data},
  year = {2021}
}

TPAMI

Multi-Scale 2D Temporal Adjacency Networks for Moment Localization with Natural Language

Songyang Zhang, Houwen Peng, Jianlong Fu, Yijuan Lu, and Jiebo Luo

IEEE Transactions on Pattern Analysis and Machine Intelligence, 2021

Bib PDF Code

@article{zhang2020multi,
  title = {Multi-Scale {2D} Temporal Adjacency Networks for Moment Localization with Natural Language},
  author = {Zhang, Songyang and Peng, Houwen and Fu, Jianlong and Lu, Yijuan and Luo, Jiebo},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year = {2021}
}

2020

BigData

Content-based Analysis of the Cultural Differences between TikTok and Douyin

Li Sun^*, Haoqi Zhang^*, Songyang Zhang, and Jiebo Luo

In IEEE Big Data, 2020

Bib PDF

@inproceedings{sun2020content,
  title = {Content-based Analysis of the Cultural Differences between {TikTok} and {Douyin}},
  author = {Sun, Li and Zhang, Haoqi and Zhang, Songyang and Luo, Jiebo},
  booktitle = {IEEE Big Data},
  year = {2020}
}

ICPR

Global Image Sentiment Transfer

Jie An, Tianlang Chen, Songyang Zhang, and Jiebo Luo

In International Conference on Pattern Recognition, 2020

Bib PDF

@inproceedings{an2020global,
  author    = {An, Jie and Chen, Tianlang and Zhang, Songyang and Luo, Jiebo},
  title     = {Global Image Sentiment Transfer},
  booktitle = {International Conference on Pattern Recognition},
  year      = {2020},
  news      = {https://www.jiqizhixin.com/articles/2020-11-09-4},
}

AAAI

Learning 2D Temporal Adjacent Networks for Moment Localization with Natural Language

Songyang Zhang, Houwen Peng, Jianlong Fu, and Jiebo Luo

In the AAAI Conference on Artificial Intelligence, 2020

Bib PDF Code Poster Slides

@inproceedings{zhang2020learning,
  news = {https://zhuanlan.zhihu.com/p/269968876},
  author = {Zhang, Songyang and Peng, Houwen and Fu, Jianlong and Luo, Jiebo},
  title = {Learning {2D} Temporal Adjacent Networks for Moment Localization with Natural Language},
  booktitle = {the AAAI Conference on Artificial Intelligence},
  year = {2020}
}

2019

arXiv

Learning Sparse 2D Temporal Adjacent Networks for Temporal Action Localization

Songyang Zhang, Houwen Peng, Le Yang, Jianlong Fu, and Jiebo Luo

arXiv preprint arXiv:1912.03612, 2019

Awarded Bib PDF Website

Winner of HACS Temporal Action Localization Challenge at ICCV 2019

@article{zhang2019learning,
  title = {Learning Sparse {2D} Temporal Adjacent Networks for Temporal Action Localization},
  author = {Zhang, Songyang and Peng, Houwen and Yang, Le and Fu, Jianlong and Luo, Jiebo},
  journal = {arXiv preprint arXiv:1912.03612},
  eprint = {1912.03612},
  archiveprefix = {arXiv},
  year = {2019}
}

ACMMM

Exploiting Temporal Relationships in Video Moment Localization with Natural Language

Songyang Zhang, Jinsong Su, and Jiebo Luo

In ACM International Conference on Multimedia, 2019

Bib PDF Code Poster Slides

@inproceedings{zhang2019exploiting,
  author    = {Zhang, Songyang and Su, Jinsong and Luo, Jiebo},
  title     = {Exploiting Temporal Relationships in Video Moment Localization with Natural Language},
  booktitle = {ACM International Conference on Multimedia},
  year      = {2019},
}

2018

TMM

Fusing Geometric Features for Skeleton-Based Action Recognition Using Multilayer LSTM Networks

Songyang Zhang, Yang Yang, Jun Xiao, Xiaoming Liu, Yi Yang, Di Xie, and Yueting Zhuang

IEEE Transactions on Multimedia, 2018

Bib PDF

@article{zhang2018fusing,
  title = {Fusing Geometric Features for Skeleton-Based Action Recognition Using Multilayer {LSTM} Networks},
  author = {Zhang, Songyang and Yang, Yang and Xiao, Jun and Liu, Xiaoming and Yang, Yi and Xie, Di and Zhuang, Yueting},
  journal = {IEEE Transactions on Multimedia},
  year = {2018}
}

2017

WACV

On Geometric Features for Skeleton-Based Action Recognition Using Multilayer LSTM Networks

Songyang Zhang, Xiaoming Liu, and Jun Xiao

In IEEE Winter Conference on Applications of Computer Vision, 2017

Bib PDF Code Poster Slides

@inproceedings{zhang2017geometric,
  title = {On Geometric Features for Skeleton-Based Action Recognition Using Multilayer {LSTM} Networks},
  author = {Zhang, Songyang and Liu, Xiaoming and Xiao, Jun},
  booktitle = {IEEE Winter Conference on Applications of Computer Vision},
  year = {2017}
}